From 3eee1b9b8fa13d044509089c7fc8186f4439d412 Mon Sep 17 00:00:00 2001 From: Harisankar Sadasivan Date: Wed, 1 Nov 2023 23:54:52 +0000 Subject: [PATCH] adding tall and skinny gemm --- .../CHANGELOG.md | 0 .../CITATION.cff | 0 .../CMakeLists.txt | 0 .../CONTRIBUTORS.md | 0 .../Config.cmake.in | 0 Dockerfile => composable_kernel/Dockerfile | 0 Jenkinsfile => composable_kernel/Jenkinsfile | 0 LICENSE => composable_kernel/LICENSE | 0 README.md => composable_kernel/README.md | 0 .../client_example}/01_gemm/CMakeLists.txt | 0 .../client_example}/01_gemm/gemm.cpp | 0 .../02_gemm_add_add_fastgelu/CMakeLists.txt | 0 .../gemm_add_add_fastgelu.cpp | 0 .../gemm_add_add_fastgelu_generic.cpp | 0 .../gemm_add_fastgelu.cpp | 0 .../gemm_add_fastgelu_generic.cpp | 0 .../gemm_fastgelu.cpp | 0 .../gemm_fastgelu_generic.cpp | 0 .../03_gemm_layernorm/CMakeLists.txt | 0 .../gemm_add_add_layernorm_naive.cpp | 0 .../gemm_add_relu_add_layernorm_welford.cpp | 0 .../04_contraction/CMakeLists.txt | 0 .../contraction_bilinear_fp32.cpp | 0 .../contraction_bilinear_fp64.cpp | 0 .../contraction_g1m2n3k1_add_xdl_fp16.cpp | 0 .../04_contraction/contraction_scale_fp32.cpp | 0 .../04_contraction/contraction_scale_fp64.cpp | 0 .../05_layernorm/CMakeLists.txt | 0 .../05_layernorm/layernorm2d.cpp | 0 .../client_example}/06_softmax/CMakeLists.txt | 0 .../client_example}/06_softmax/softmax4d.cpp | 0 .../07_grouped_convnd_fwd/CMakeLists.txt | 0 .../grouped_conv1d_fwd.cpp | 0 .../grouped_conv2d_fwd.cpp | 0 .../08_fused_attention/CMakeLists.txt | 0 .../08_fused_attention/fused_attention.cpp | 0 .../fused_attention_bias.cpp | 0 .../09_quantization/CMakeLists.txt | 0 ..._fwd_bias_relu_perchannel_quantization.cpp | 0 ...2d_fwd_bias_relu_perlayer_quantization.cpp | 0 ..._fwd_bias_tanh_perchannel_quantization.cpp | 0 ...2d_fwd_bias_tanh_perlayer_quantization.cpp | 0 .../conv2d_fwd_perchannel_quantization.cpp | 0 .../conv2d_fwd_perlayer_quantization.cpp | 0 .../09_quantization/gemm_quantization.cpp | 0 
.../10_grouped_convnd_bwd_data/CMakeLists.txt | 0 .../grouped_conv2d_bwd_data.cpp | 0 .../grouped_conv3d_bwd_data.cpp | 0 ..._conv3d_bwd_data_input_fp16_comp_bf8f8.cpp | 0 .../11_grouped_conv_bwd_weight/CMakeLists.txt | 0 .../11_grouped_conv_bwd_weight/common.hpp | 0 .../grouped_conv1d_bwd_weight_fp16.cpp | 0 .../grouped_conv2d_bwd_weight_fp16.cpp | 0 .../grouped_conv3d_bwd_weight_fp16.cpp | 0 .../grouped_conv3d_bwd_weight_fp32.cpp | 0 .../CMakeLists.txt | 0 .../elementwise_layernorm2d.cpp | 0 .../13_batchnorm/CMakeLists.txt | 0 .../13_batchnorm/batchnorm_bwd_nhwc.cpp | 0 .../13_batchnorm/batchnorm_fwd_nhwc.cpp | 0 .../13_batchnorm/batchnorm_infer_nhwc.cpp | 0 .../14_instance_id/CMakeLists.txt | 0 .../batchnorm_fwd_instance_id.cpp | 0 .../15_convnd_bwd_data/CMakeLists.txt | 0 .../15_convnd_bwd_data/common.hpp | 0 .../conv3d_bwd_data_fp16.cpp | 0 .../conv3d_bwd_data_fp32.cpp | 0 .../15_gemm_add_multiply/CMakeLists.txt | 0 .../gemm_add_multiply.cpp | 0 .../client_example}/15_reduce/CMakeLists.txt | 0 .../15_reduce/reduce_nhwc_c.cpp | 0 .../16_convnd_fwd/CMakeLists.txt | 0 .../client_example}/16_convnd_fwd/common.hpp | 0 .../16_convnd_fwd/conv3d_fwd_fp16.cpp | 0 .../conv3d_fwd_fp16_comp_fp8.cpp | 0 .../16_convnd_fwd/conv3d_fwd_fp32.cpp | 0 .../17_grouped_gemm_fastgelu/CMakeLists.txt | 0 .../grouped_gemm_fastgelu.cpp | 0 .../18_groupnorm/CMakeLists.txt | 0 .../18_groupnorm/groupnorm_swish.cpp | 0 .../client_example}/19_pool/CMakeLists.txt | 0 .../19_pool/avg_pool3d_bwd.cpp | 0 .../19_pool/avg_pool3d_fwd.cpp | 0 .../19_pool/max_pool2d_bwd.cpp | 0 .../19_pool/max_pool2d_fwd.cpp | 0 .../20_splitk_gemm/CMakeLists.txt | 0 .../20_splitk_gemm/splitK_gemm_fp16_f8.cpp | 0 .../21_grouped_gemm_bias/CMakeLists.txt | 0 .../grouped_gemm_fixed_nk_bias_fp16.cpp | 0 .../22_grouped_gemm/CMakeLists.txt | 0 .../grouped_gemm_fixed_nk_fp16.cpp | 0 .../grouped_gemm_fixed_nk_fp8.cpp | 0 .../grouped_gemm_fixed_nk_i8.cpp | 0 .../22_im2col_col2im/CMakeLists.txt | 0 
.../22_im2col_col2im/column_to_image.cpp | 0 .../22_im2col_col2im/image_to_column.cpp | 0 .../client_example}/CMakeLists.txt | 0 .../client_example}/README.md | 0 .../cmake}/Analyzers.cmake | 0 .../cmake}/ClangTidy.cmake | 0 .../cmake}/CppCheck.cmake | 0 .../cmake}/DoxygenDoc.cmake | 0 .../cmake}/EnableCompilerWarnings.cmake | 0 .../cmake}/TargetFlags.cmake | 0 .../cmake}/googletest.cmake | 0 .../dev-requirements.txt | 0 .../docs}/API_Reference_Guide.rst | 0 .../docs}/Contributors_Guide.rst | 0 .../docs}/Supported_Primitives_Guide.rst | 0 {docs => composable_kernel/docs}/conf.py | 0 .../docs}/data/ck_component.png | Bin .../docs}/data/ck_layer.png | Bin .../docs}/dockerhub.rst | 0 .../docs}/doxygen/Doxyfile | 0 {docs => composable_kernel/docs}/index.rst | 0 {docs => composable_kernel/docs}/license.rst | 0 {docs => composable_kernel/docs}/refs.bib | 0 .../docs}/sphinx/_toc.yml.in | 0 .../docs}/sphinx/requirements.in | 0 .../docs}/sphinx/requirements.txt | 0 .../docs}/tutorial_hello_world.rst | 0 .../example}/01_gemm/CMakeLists.txt | 0 .../example}/01_gemm/README.md | 0 .../example}/01_gemm/common.hpp | 0 .../example}/01_gemm/gemm_dl_fp16.cpp | 0 .../example}/01_gemm/gemm_dl_fp32.cpp | 0 .../example}/01_gemm/gemm_dl_int4.cpp | 0 .../example}/01_gemm/gemm_dl_int8.cpp | 0 .../example}/01_gemm/gemm_dpp_fp16.cpp | 0 .../example}/01_gemm/gemm_wmma_fp16.cpp | 0 .../example}/01_gemm/gemm_xdl_bf16.cpp | 0 .../example}/01_gemm/gemm_xdl_bf16_rtn.cpp | 0 .../example}/01_gemm/gemm_xdl_fp16.cpp | 0 .../example}/01_gemm/gemm_xdl_fp16_fp8.cpp | 0 .../example}/01_gemm/gemm_xdl_fp64.cpp | 0 .../example}/01_gemm/gemm_xdl_fp8.cpp | 0 .../example}/01_gemm/gemm_xdl_fp8_bf8.cpp | 0 .../example}/01_gemm/gemm_xdl_int4.cpp | 0 .../example}/01_gemm/gemm_xdl_int8.cpp | 0 .../01_gemm/gemm_xdl_skip_b_lds_fp16.cpp | 0 .../example}/01_gemm/gemm_xdl_streamk.cpp | 0 .../01_gemm/gemm_xdl_wavelet_fp16.cpp | 0 .../example}/01_gemm/run_gemm_example.inc | 0 .../example}/02_gemm_bilinear/CMakeLists.txt | 
0 .../example}/02_gemm_bilinear/README.md | 0 .../gemm_bilinear_wmma_fp16.cpp | 0 .../gemm_bilinear_wmma_int8.cpp | 0 .../gemm_bilinear_xdl_fp16.cpp | 0 .../example}/03_gemm_bias_relu/CMakeLists.txt | 0 .../example}/03_gemm_bias_relu/README.md | 0 .../gemm_bias_relu_xdl_fp16.cpp | 0 .../04_gemm_add_add_fastgelu/CMakeLists.txt | 0 .../04_gemm_add_add_fastgelu/README.md | 0 .../04_gemm_add_add_fastgelu/common.hpp | 0 .../gemm_add_add_fastgelu_xdl_bf16.cpp | 0 .../gemm_add_add_fastgelu_xdl_fp16.cpp | 0 .../gemm_add_add_fastgelu_xdl_fp32.cpp | 0 .../gemm_add_add_fastgelu_xdl_int4.cpp | 0 .../gemm_add_add_fastgelu_xdl_int8.cpp | 0 .../run_gemm_add_add_fastgelu_example.inc | 0 .../example}/09_convnd_fwd/CMakeLists.txt | 0 .../example}/09_convnd_fwd/README.md | 0 .../09_convnd_fwd/convnd_fwd_common.hpp | 0 .../09_convnd_fwd/convnd_fwd_dl_common.hpp | 0 .../09_convnd_fwd/convnd_fwd_dl_fp16.cpp | 0 .../09_convnd_fwd/convnd_fwd_dl_fp32.cpp | 0 .../09_convnd_fwd/convnd_fwd_dl_int8.cpp | 0 .../09_convnd_fwd/convnd_fwd_xdl_bf16.cpp | 0 .../09_convnd_fwd/convnd_fwd_xdl_fp16.cpp | 0 .../09_convnd_fwd/convnd_fwd_xdl_fp32.cpp | 0 .../09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 0 .../09_convnd_fwd/convnd_fwd_xdl_int8.cpp | 0 .../run_convnd_fwd_dl_example.inc | 0 .../09_convnd_fwd/run_convnd_fwd_example.inc | 0 .../CMakeLists.txt | 0 .../common.hpp | 0 .../convnd_fwd_max_xdl_bf16.cpp | 0 .../convnd_fwd_max_xdl_fp16.cpp | 0 .../convnd_fwd_max_xdl_fp32.cpp | 0 .../convnd_fwd_max_xdl_int4.cpp | 0 .../convnd_fwd_max_xdl_int8.cpp | 0 .../run_convnd_fwd_max_example.inc | 0 .../example}/12_reduce/CMakeLists.txt | 0 .../example}/12_reduce/README.md | 0 .../example}/12_reduce/reduce_blockwise.cpp | 0 .../12_reduce/reduce_blockwise_impl.hpp | 0 .../12_reduce/reduce_blockwise_two_call.cpp | 0 .../12_reduce/reduce_example_common.hpp | 0 .../reduce_multiblock_atomic_add.cpp | 0 .../reduce_multiblock_atomic_add_impl.hpp | 0 .../example}/13_pool2d_fwd/CMakeLists.txt | 0 
.../example}/13_pool2d_fwd/README.md | 0 .../13_pool2d_fwd/pool2d_fwd_common.hpp | 0 .../13_pool2d_fwd/pool2d_fwd_fp16.cpp | 0 .../13_pool2d_fwd/pool2d_fwd_fp32.cpp | 0 .../14_gemm_quantization/CMakeLists.txt | 0 .../gemm_dl_quantization_int8.cpp | 0 .../gemm_xdl_bias_relu_quantization_int8.cpp | 0 .../gemm_xdl_quantization_int8.cpp | 0 .../example}/15_grouped_gemm/CMakeLists.txt | 0 .../example}/15_grouped_gemm/README.md | 0 .../grouped_gemm_multiple_d_dl_fp16.cpp | 0 .../15_grouped_gemm/grouped_gemm_xdl_bf16.cpp | 0 .../grouped_gemm_xdl_fixed_nk_bias_fp16.cpp | 0 .../grouped_gemm_xdl_fixed_nk_fp16.cpp | 0 .../grouped_gemm_xdl_fixed_nk_fp8.cpp | 0 .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 0 .../15_grouped_gemm/grouped_gemm_xdl_fp32.cpp | 0 .../15_grouped_gemm/grouped_gemm_xdl_int4.cpp | 0 .../15_grouped_gemm/grouped_gemm_xdl_int8.cpp | 0 .../grouped_gemm_xdl_splitk_fp16.cpp | 0 .../run_grouped_gemm_example.inc | 0 .../CMakeLists.txt | 0 .../gemm_add_add_mean_meansquare_xdl_fp16.cpp | 0 .../gemm_add_addsquare_xdl_int8.cpp | 0 .../gemm_max_xdl_bf16.cpp | 0 .../gemm_max_xdl_fp16.cpp | 0 .../gemm_max_xdl_fp32.cpp | 0 .../gemm_max_xdl_int4.cpp | 0 .../gemm_max_xdl_int8.cpp | 0 .../gemm_mean_meansquare_xdl_bf16.cpp | 0 .../gemm_mean_meansquare_xdl_fp16.cpp | 0 .../gemm_mean_meansquare_xdl_fp32.cpp | 0 .../gemm_reduce_xdl_common.hpp | 0 .../17_convnd_bwd_data/CMakeLists.txt | 0 .../example}/17_convnd_bwd_data/README.md | 0 .../convnd_bwd_data_common.hpp | 0 .../convnd_bwd_data_dl_fp16.cpp | 0 .../convnd_bwd_data_xdl_fp16.cpp | 0 .../18_batched_gemm_reduce/CMakeLists.txt | 0 .../batched_gemm_reduce_xdl_fp16.cpp | 0 .../19_binary_elementwise/CMakeLists.txt | 0 .../broadcast_add_2d_amn_bn.cpp | 0 .../broadcast_add_3d_am_bmnk.cpp | 0 .../elementwise_add_1d.cpp | 0 .../elementwise_add_4d.cpp | 0 .../20_grouped_conv_bwd_weight/CMakeLists.txt | 0 .../20_grouped_conv_bwd_weight/common.hpp | 0 .../grouped_conv_bwd_weight_dl_fp16.cpp | 0 
.../grouped_conv_bwd_weight_wmma_fp16.cpp | 0 .../grouped_conv_bwd_weight_xdl_bf16.cpp | 0 .../grouped_conv_bwd_weight_xdl_fp16.cpp | 0 ..._conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp | 0 .../run_grouped_conv_bwd_weight_example.inc | 0 .../example}/21_gemm_layernorm/CMakeLists.txt | 0 ...bias_relu_add_layernorm_xdl_naive_fp16.cpp | 0 ...as_relu_add_layernorm_xdl_welford_fp16.cpp | 0 .../gemm_layernorm_xdl_naive_fp16.cpp | 0 ...xdl_layernorm_naive_single_kernel_fp16.cpp | 0 .../example}/22_cgemm/CMakeLists.txt | 0 .../example}/22_cgemm/cgemm_xdl_bf16.cpp | 0 .../example}/22_cgemm/cgemm_xdl_common.hpp | 0 .../example}/22_cgemm/cgemm_xdl_fp16.cpp | 0 .../example}/22_cgemm/cgemm_xdl_fp32.cpp | 0 .../example}/22_cgemm/cgemm_xdl_int4.cpp | 0 .../example}/22_cgemm/cgemm_xdl_int8.cpp | 0 .../example}/23_softmax/CMakeLists.txt | 0 .../example}/23_softmax/README.md | 0 .../example}/23_softmax/softmax_blockwise.cpp | 0 .../example}/24_batched_gemm/CMakeLists.txt | 0 .../24_batched_gemm/batched_gemm_xdl_bf16.cpp | 0 .../24_batched_gemm/batched_gemm_xdl_fp16.cpp | 0 .../24_batched_gemm/batched_gemm_xdl_fp32.cpp | 0 .../24_batched_gemm/batched_gemm_xdl_int4.cpp | 0 .../24_batched_gemm/batched_gemm_xdl_int8.cpp | 0 .../run_batched_gemm_example.inc | 0 .../25_gemm_bias_e_permute/CMakeLists.txt | 0 .../gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp | 0 .../gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp | 0 .../example}/26_contraction/CMakeLists.txt | 0 .../example}/26_contraction/README.md | 0 .../contraction_bilinear_xdl_fp32.cpp | 0 .../contraction_bilinear_xdl_fp64.cpp | 0 .../contraction_scale_xdl_fp32.cpp | 0 .../contraction_scale_xdl_fp64.cpp | 0 .../example}/27_layernorm/CMakeLists.txt | 0 .../example}/27_layernorm/common.hpp | 0 .../example}/27_layernorm/layernorm_fp16.cpp | 0 .../27_layernorm/layernorm_splitk_fp16.cpp | 0 .../27_layernorm/run_layernorm_example.inc | 0 .../CMakeLists.txt | 0 .../grouped_gemm_bias_e_permute_xdl_fp16.cpp | 0 .../CMakeLists.txt | 0 
.../batched_gemm_bias_e_permute_wmma_fp16.cpp | 0 .../batched_gemm_bias_e_permute_xdl_fp16.cpp | 0 .../CMakeLists.txt | 0 .../30_grouped_conv_fwd_multiple_d/README.md | 0 .../30_grouped_conv_fwd_multiple_d/common.hpp | 0 .../common_wmma.hpp | 0 ...ouped_conv_fwd_bias_relu_add_wmma_fp16.cpp | 0 ...ouped_conv_fwd_bias_relu_add_wmma_int8.cpp | 0 ...rouped_conv_fwd_bias_relu_add_xdl_bf16.cpp | 0 ...rouped_conv_fwd_bias_relu_add_xdl_fp16.cpp | 0 ...rouped_conv_fwd_bias_relu_add_xdl_fp32.cpp | 0 ...rouped_conv_fwd_bias_relu_add_xdl_int4.cpp | 0 ...rouped_conv_fwd_bias_relu_add_xdl_int8.cpp | 0 .../grouped_conv_fwd_xdl_fp16.cpp | 0 ...grouped_conv_fwd_bias_relu_add_example.inc | 0 ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 0 .../run_grouped_conv_fwd_example.inc | 0 .../31_batched_gemm_gemm/CMakeLists.txt | 0 .../batched_gemm_gemm_xdl_bf16.cpp | 0 .../batched_gemm_gemm_xdl_fp16.cpp | 0 .../batched_gemm_gemm_xdl_fp32.cpp | 0 .../batched_gemm_gemm_xdl_int4.cpp | 0 .../batched_gemm_gemm_xdl_int8.cpp | 0 .../run_batched_gemm_gemm_example.inc | 0 .../CMakeLists.txt | 0 ...le_scale_softmax_gemm_permute_xdl_fp16.cpp | 0 ...mm_scale_softmax_gemm_permute_xdl_bf16.cpp | 0 ...mm_scale_softmax_gemm_permute_xdl_fp16.cpp | 0 ...tched_gemm_scale_softmax_gemm_xdl_bf16.cpp | 0 ...tched_gemm_scale_softmax_gemm_xdl_fp16.cpp | 0 ...le_scale_softmax_gemm_permute_xdl_fp16.cpp | 0 ...mm_scale_softmax_gemm_permute_xdl_fp16.cpp | 0 .../run_batched_gemm_scale_softmax_gemm.inc | 0 ...atched_gemm_scale_softmax_gemm_permute.inc | 0 ...rouped_gemm_scale_softmax_gemm_permute.inc | 0 .../33_multiple_reduce/CMakeLists.txt | 0 .../example}/33_multiple_reduce/README.md | 0 .../33_multiple_reduce/dual_reduce_common.hpp | 0 .../dual_reduce_multiblock.cpp | 0 .../dual_reduce_threadwise.cpp | 0 .../example}/34_batchnorm/CMakeLists.txt | 0 .../example}/34_batchnorm/README.md | 0 .../34_batchnorm/batchnorm_backward_nhwc.cpp | 0 .../34_batchnorm/batchnorm_common.hpp | 0 
.../batchnorm_forward_inferring_nhwc.cpp | 0 .../batchnorm_forward_training_nhwc.cpp | 0 ...tchnorm_forward_training_nhwc_obsolete.cpp | 0 .../34_batchnorm/batchnorm_infer_impl.hpp | 0 .../example}/35_splitK_gemm/CMakeLists.txt | 0 .../run_splitK_gemm_example.inc | 0 .../35_splitK_gemm/splitK_gemm_xdl_bf16.cpp | 0 .../35_splitK_gemm/splitK_gemm_xdl_fp16.cpp | 0 .../35_splitK_gemm/splitK_gemm_xdl_fp32.cpp | 0 .../35_splitK_gemm/splitK_gemm_xdl_int4.cpp | 0 .../35_splitK_gemm/splitK_gemm_xdl_int8.cpp | 0 .../36_sparse_embedding/CMakeLists.txt | 0 .../sparse_embedding3_forward_layernorm.cpp | 0 .../CMakeLists.txt | 0 ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 0 .../CMakeLists.txt | 0 .../common.hpp | 0 ...ouped_conv_bwd_data_bias_relu_xdl_fp16.cpp | 0 .../grouped_conv_bwd_data_wmma_fp16.cpp | 0 .../grouped_conv_bwd_data_xdl_fp16.cpp | 0 ...rouped_conv_bwd_data_bias_relu_example.inc | 0 .../run_grouped_conv_bwd_data_example.inc | 0 .../example}/39_permute/CMakeLists.txt | 0 .../example}/39_permute/common.hpp | 0 .../39_permute/permute_1xHxW_fp16.cpp | 0 .../39_permute/permute_HxWx4_fp16.cpp | 0 .../39_permute/permute_NxHxW_fp16.cpp | 0 .../39_permute/run_permute_bundle_example.inc | 0 .../run_permute_element_example.inc | 0 .../40_conv2d_fwd_quantization/CMakeLists.txt | 0 .../40_conv2d_fwd_quantization/common.hpp | 0 ...bias_relu_perchannel_quantization_int8.cpp | 0 ...l_bias_relu_perlayer_quantization_int8.cpp | 0 ...bias_tanh_perchannel_quantization_int8.cpp | 0 ...l_bias_tanh_perlayer_quantization_int8.cpp | 0 ...2d_fwd_dl_perchannel_quantization_int8.cpp | 0 ...nv2d_fwd_dl_perlayer_quantization_int8.cpp | 0 ...bias_relu_perchannel_quantization_int8.cpp | 0 ...l_bias_relu_perlayer_quantization_int8.cpp | 0 ...d_fwd_xdl_perchannel_quantization_int8.cpp | 0 ...v2d_fwd_xdl_perlayer_quantization_int8.cpp | 0 ...d_bias_perchannel_quantization_example.inc | 0 ...fwd_bias_perlayer_quantization_example.inc | 0 ...2d_fwd_perchannel_quantization_example.inc | 0 
...nv2d_fwd_perlayer_quantization_example.inc | 0 .../41_grouped_conv_conv_fwd/CMakeLists.txt | 0 .../grouped_conv_conv_fwd_xdl_bf16.cpp | 0 .../grouped_conv_conv_fwd_xdl_fp16.cpp | 0 .../grouped_conv_conv_fwd_xdl_fp32.cpp | 0 .../grouped_conv_conv_fwd_xdl_int4.cpp | 0 .../grouped_conv_conv_fwd_xdl_int8.cpp | 0 .../run_grouped_conv_conv_fwd_example.inc | 0 .../example}/42_groupnorm/CMakeLists.txt | 0 .../example}/42_groupnorm/common.hpp | 0 .../groupnorm_sigmoid_mul_fp16.cpp | 0 .../42_groupnorm/groupnorm_splitk_fp16.cpp | 0 .../42_groupnorm/groupnorm_swish_fp16.cpp | 0 .../42_groupnorm/run_groupnorm_example.inc | 0 .../CMakeLists.txt | 0 .../splitk_gemm_bias_e_permute_xdl_fp16.cpp | 0 .../splitk_gemm_bias_e_permute_xdl_fp32.cpp | 0 .../44_elementwise_permute/CMakeLists.txt | 0 .../elementwise_permute_4D_fp16.cpp | 0 .../elementwise_permute_4D_fp16_2d.cpp | 0 .../CMakeLists.txt | 0 .../elementwise_layernorm_blockwise.cpp | 0 .../46_gemm_add_multiply/CMakeLists.txt | 0 .../example}/46_gemm_add_multiply/README.md | 0 .../example}/46_gemm_add_multiply/common.hpp | 0 .../gemm_add_multiply_dl_fp16.cpp | 0 .../gemm_add_multiply_xdl_fp16.cpp | 0 .../run_gemm_add_multiply_example.inc | 0 .../CMakeLists.txt | 0 .../gemm_bias_softmax_gemm_permute.cpp | 0 .../example}/48_pool3d_fwd/CMakeLists.txt | 0 .../48_pool3d_fwd/pool3d_fwd_common.hpp | 0 .../48_pool3d_fwd/pool3d_fwd_fp16.cpp | 0 .../example}/49_maxpool2d_bwd/CMakeLists.txt | 0 .../49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp | 0 .../49_maxpool2d_bwd/maxpool2d_bwd_common.hpp | 0 .../49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp | 0 .../49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp | 0 .../example}/50_put_element/CMakeLists.txt | 0 .../50_put_element/put_element_fp16.cpp | 0 .../example}/51_avgpool3d_bwd/CMakeLists.txt | 0 .../51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp | 0 .../51_avgpool3d_bwd/avgpool3d_bwd_common.hpp | 0 .../51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp | 0 .../51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp | 0 
.../example}/52_im2col_col2im/CMakeLists.txt | 0 .../52_im2col_col2im/column_to_image_f32.cpp | 0 .../example}/52_im2col_col2im/common.hpp | 0 .../52_im2col_col2im/image_to_column_f32.cpp | 0 .../example}/53_gemv_splitk/CMakeLists.txt | 0 .../example}/53_gemv_splitk/README.md | 0 .../example}/53_gemv_splitk/common.hpp | 0 .../53_gemv_splitk/gemv_splitk_fp16.cpp | 4 +- .../run_gemv_splitk_example.inc | 0 .../CMakeLists.txt | 12 + .../54_tall_and_skinny_gemm_splitk/README.md | 19 + .../54_tall_and_skinny_gemm_splitk/common.hpp | 95 +++ ...un_tall_and_skinny_gemm_splitk_example.inc | 196 +++++ .../tall_and_skinny_gemm_splitk_fp16.cpp | 43 + .../example}/60_gemm_multi_ABD/CMakeLists.txt | 0 .../gemm_multi_ABD_xdl_fp16.cpp | 0 .../61_contraction_multi_ABD/CMakeLists.txt | 0 .../contraction_multi_ABD_xdl_fp16.cpp | 0 .../example}/62_conv_fwd_activ/CMakeLists.txt | 0 .../convnd_fwd_activ_common.hpp | 0 .../convnd_fwd_xdl_abs_fp16.cpp | 0 .../convnd_fwd_xdl_clippedrelu_fp16.cpp | 0 .../convnd_fwd_xdl_elu_fp16.cpp | 0 .../convnd_fwd_xdl_leakyrelu_fp16.cpp | 0 .../convnd_fwd_xdl_pow_fp16.cpp | 0 .../convnd_fwd_xdl_relu_fp16.cpp | 0 .../convnd_fwd_xdl_sigmoid_fp16.cpp | 0 .../convnd_fwd_xdl_softrelu_fp16.cpp | 0 .../convnd_fwd_xdl_tanh_fp16.cpp | 0 .../run_convnd_fwd_activ_example.inc | 0 .../example}/CMakeLists.txt | 0 .../include}/ck/ck.hpp | 0 .../include}/ck/config.h.in | 0 .../include}/ck/host_utility/device_prop.hpp | 0 .../ck/host_utility/hip_check_error.hpp | 0 .../include}/ck/host_utility/io.hpp | 0 .../ck/host_utility/kernel_launch.hpp | 0 .../ck/host_utility/stream_utility.hpp | 0 ...n3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp | 0 .../include}/ck/stream_config.hpp | 0 .../include}/ck/tensor/static_tensor.hpp | 0 .../tensor_description/cluster_descriptor.hpp | 0 .../multi_index_transform.hpp | 0 .../multi_index_transform_helper.hpp | 0 .../ck/tensor_description/tensor_adaptor.hpp | 0 .../tensor_description/tensor_descriptor.hpp | 0 .../tensor_descriptor_helper.hpp | 
0 .../tensor_space_filling_curve.hpp | 0 .../gpu/block/blockwise_gemm_dl_v2r3.hpp | 0 .../gpu/block/blockwise_gemm_dlops_v2r2.hpp | 0 .../gpu/block/blockwise_gemm_dlops_v3.hpp | 0 .../gpu/block/blockwise_gemm_dpp.hpp | 0 .../gpu/block/blockwise_gemm_wmma.hpp | 0 .../gpu/block/blockwise_gemm_xdlops.hpp | 0 .../blockwise_gemm_xdlops_skip_b_lds.hpp | 0 .../gpu/block/blockwise_softmax.hpp | 0 .../blockwise_tensor_slice_transfer_v5r1.hpp | 0 .../gpu/block/blockwise_welford.hpp | 0 .../block/reduction_functions_blockwise.hpp | 0 ...hread_group_tensor_slice_transfer_v4r1.hpp | 0 ...hread_group_tensor_slice_transfer_v6r1.hpp | 0 ...ead_group_tensor_slice_transfer_v6r1r2.hpp | 0 ...hread_group_tensor_slice_transfer_v6r2.hpp | 0 ...hread_group_tensor_slice_transfer_v6r3.hpp | 0 .../thread_group_tensor_slice_transfer_v7.hpp | 0 ...hread_group_tensor_slice_transfer_v7r2.hpp | 0 .../gpu/device/conv_tensor_rearrange_op.hpp | 0 ...nvolution_backward_data_specialization.hpp | 0 ...olution_backward_weight_specialization.hpp | 0 .../convolution_forward_specialization.hpp | 0 .../gpu/device/device_avgpool_bwd.hpp | 0 .../gpu/device/device_base.hpp | 0 .../device_batched_contraction_multiple_d.hpp | 0 .../gpu/device/device_batched_gemm.hpp | 0 .../device/device_batched_gemm_e_permute.hpp | 0 .../gpu/device/device_batched_gemm_gemm.hpp | 0 .../device/device_batched_gemm_multi_d.hpp | 0 ...atched_gemm_multiple_d_gemm_multiple_d.hpp | 0 .../device_batched_gemm_softmax_gemm.hpp | 0 ...vice_batched_gemm_softmax_gemm_permute.hpp | 0 .../gpu/device/device_batchnorm_backward.hpp | 0 .../gpu/device/device_batchnorm_forward.hpp | 0 .../gpu/device/device_batchnorm_infer.hpp | 0 .../gpu/device/device_cgemm.hpp | 0 .../device_contraction_multiple_abd.hpp | 0 .../device/device_contraction_multiple_d.hpp | 0 .../gpu/device/device_conv_bwd_data.hpp | 0 .../gpu/device/device_conv_fwd.hpp | 0 .../device_conv_fwd_bias_activation.hpp | 0 .../device_conv_fwd_bias_activation_add.hpp | 0 
.../device/device_conv_tensor_rearrange.hpp | 0 .../gpu/device/device_elementwise.hpp | 0 .../device_elementwise_normalization.hpp | 0 .../gpu/device/device_gemm.hpp | 0 .../gpu/device/device_gemm_bias_e_permute.hpp | 0 .../gpu/device/device_gemm_multiple_abd.hpp | 0 .../gpu/device/device_gemm_multiple_d.hpp | 0 .../device_gemm_multiple_d_layernorm.hpp | 0 .../device_gemm_multiple_d_multiple_r.hpp | 0 .../gpu/device/device_gemm_reduce.hpp | 0 .../gpu/device/device_gemm_splitk.hpp | 0 .../gpu/device/device_gemm_streamk.hpp | 0 .../device_grouped_contraction_multiple_d.hpp | 0 ...evice_grouped_conv_bwd_data_multiple_d.hpp | 0 .../device/device_grouped_conv_bwd_weight.hpp | 0 .../gpu/device/device_grouped_conv_fwd.hpp | 0 .../device_grouped_conv_fwd_multiple_d.hpp | 0 .../gpu/device/device_grouped_gemm.hpp | 0 .../device/device_grouped_gemm_fixed_nk.hpp | 0 ...vice_grouped_gemm_softmax_gemm_permute.hpp | 0 .../gpu/device/device_grouped_gemm_splitk.hpp | 0 .../gpu/device/device_max_pool_bwd.hpp | 0 .../gpu/device/device_multiple_reduce.hpp | 0 .../gpu/device/device_normalization.hpp | 0 .../gpu/device/device_permute.hpp | 0 .../gpu/device/device_pool_fwd.hpp | 0 .../gpu/device/device_put_element.hpp | 0 .../gpu/device/device_reduce.hpp | 0 .../gpu/device/device_softmax.hpp | 0 .../device_splitk_contraction_multiple_d.hpp | 0 .../device/device_tall_and_skinny_gemm.hpp | 2 +- .../gpu/device/gemm_specialization.hpp | 0 .../impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp | 0 ...d_contraction_multiple_d_wmma_cshuffle.hpp | 0 ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 0 .../device_batched_gemm_e_permute_xdl.hpp | 0 .../device_batched_gemm_gemm_xdl_cshuffle.hpp | 0 .../impl/device_batched_gemm_multi_d_xdl.hpp | 0 .../device_batched_gemm_multiple_d_dl.hpp | 0 ...ultiple_d_gemm_multiple_d_xdl_cshuffle.hpp | 0 ...evice_batched_gemm_reduce_xdl_cshuffle.hpp | 0 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 0 ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp | 0 
.../device/impl/device_batched_gemm_xdl.hpp | 0 .../impl/device_batchnorm_backward_impl.hpp | 0 .../impl/device_batchnorm_forward_impl.hpp | 0 ...device_batchnorm_forward_impl_obsolete.hpp | 0 .../impl/device_cgemm_4gemm_xdl_cshuffle.hpp | 0 .../impl/device_column_to_image_impl.hpp | 0 ..._contraction_multiple_abd_xdl_cshuffle.hpp | 0 ...ce_contraction_multiple_d_xdl_cshuffle.hpp | 0 ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 0 ...ice_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp | 0 ...fle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 0 ...shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 0 ...onv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 0 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp | 0 ...ice_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp | 0 ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp | 0 .../device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp | 0 ...device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp | 0 .../impl/device_elementwise_2d_impl.hpp | 0 .../device/impl/device_elementwise_impl.hpp | 0 .../device_elementwise_normalization_impl.hpp | 0 ...vice_gemm_bias_add_reduce_xdl_cshuffle.hpp | 0 .../gpu/device/impl/device_gemm_dl.hpp | 0 .../gpu/device/impl/device_gemm_dpp.hpp | 0 .../device_gemm_multiple_abd_xdl_cshuffle.hpp | 0 .../device/impl/device_gemm_multiple_d_dl.hpp | 0 ...gemm_multiple_d_layernorm_xdl_cshuffle.hpp | 0 ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp | 0 .../device_gemm_multiple_d_wmma_cshuffle.hpp | 0 .../device_gemm_multiple_d_xdl_cshuffle.hpp | 0 .../impl/device_gemm_reduce_xdl_cshuffle.hpp | 0 .../gpu/device/impl/device_gemm_wmma.hpp | 0 .../gpu/device/impl/device_gemm_xdl.hpp | 0 .../device/impl/device_gemm_xdl_cshuffle.hpp | 0 .../device_gemm_xdl_layernorm_cshuffle.hpp | 0 .../impl/device_gemm_xdl_skip_b_lds.hpp | 0 .../impl/device_gemm_xdl_splitk_c_shuffle.hpp | 0 .../device/impl/device_gemm_xdl_streamk.hpp | 0 .../device_gemm_xdl_waveletmodel_cshuffle.hpp | 0 ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 0 ...conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 0 
...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 0 .../device_grouped_conv_bwd_weight_dl.hpp | 0 ..._grouped_conv_bwd_weight_wmma_cshuffle.hpp | 0 ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 0 ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 0 ...ice_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp | 0 ...grouped_conv_fwd_multiple_d_multiple_r.hpp | 0 ...fwd_multiple_d_multiple_r_xdl_cshuffle.hpp | 0 ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 0 ...ouped_conv_fwd_multiple_d_xdl_cshuffle.hpp | 0 .../device/impl/device_grouped_conv_utils.hpp | 0 .../device_grouped_gemm_multiple_d_dl.hpp | 0 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 0 .../device/impl/device_grouped_gemm_xdl.hpp | 0 .../impl/device_grouped_gemm_xdl_fixed_nk.hpp | 0 ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp | 0 .../impl/device_image_to_column_impl.hpp | 0 .../device/impl/device_max_pool_bwd_impl.hpp | 0 .../device_multiple_reduce_multiblock.hpp | 0 .../device_multiple_reduce_threadwise.hpp | 0 .../device/impl/device_normalization_impl.hpp | 0 .../impl/device_normalization_splitk_impl.hpp | 0 .../gpu/device/impl/device_permute_impl.hpp | 0 .../impl/device_pool2d_fwd_nhwc_nhwc.hpp | 0 .../impl/device_pool3d_fwd_ndhwc_ndhwc.hpp | 0 .../device/impl/device_put_element_impl.hpp | 0 .../gpu/device/impl/device_reduce_common.hpp | 0 .../device/impl/device_reduce_multiblock.hpp | 0 .../device/impl/device_reduce_threadwise.hpp | 0 .../gpu/device/impl/device_softmax_impl.hpp | 0 ...ce_sparse_embeddings_forward_layernorm.hpp | 0 ...tk_contraction_multiple_d_xdl_cshuffle.hpp | 0 .../device_tall_and_skinny_gemm_splitk.hpp | 59 +- .../gpu/device/masking_specialization.hpp | 0 .../gpu/device/matrix_padder.hpp | 0 .../gpu/device/reduction_operator_mapping.hpp | 0 .../gpu/device/tensor_layout.hpp | 0 .../gpu/device/tensor_specialization.hpp | 0 .../gpu/device/welford_helper.hpp | 0 .../element/binary_element_wise_operation.hpp | 0 .../gpu/element/element_wise_operation.hpp | 0 
.../gpu/element/quantization_operation.hpp | 0 .../element/unary_element_wise_operation.hpp | 0 .../gridwise_multiblock_batchnorm_forward.hpp | 0 ...e_second_half_batchnorm_backward_final.hpp | 0 ...gridwise_multiblock_welford_first_half.hpp | 0 ..._half_batchnorm_forward_final_obsolete.hpp | 0 ...cond_half_multiblock_reduce_first_half.hpp | 0 .../gpu/grid/block_to_ctile_map.hpp | 0 ...iple_d_welford_first_half_xdl_cshuffle.hpp | 0 ...idwise_welford_second_half_layernorm2d.hpp | 0 ...dwise_2d_multiple_reduction_multiblock.hpp | 0 ...dwise_2d_multiple_reduction_threadwise.hpp | 0 .../grid/gridwise_2d_reduction_multiblock.hpp | 0 .../grid/gridwise_2d_reduction_threadwise.hpp | 0 ...wise_batched_gemm_gemm_xdl_cshuffle_v1.hpp | 0 ...iple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp | 0 ...ultiple_d_softmax_gemm_xdl_cshuffle_v1.hpp | 0 ...ched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp | 0 ...e_batchnorm_backward_blockwise_welford.hpp | 0 ...se_batchnorm_forward_blockwise_welford.hpp | 0 .../gpu/grid/gridwise_elementwise_1d.hpp | 0 .../gpu/grid/gridwise_elementwise_2d.hpp | 0 ...elementwise_layernorm_welford_variance.hpp | 0 ...e_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp | 0 .../gpu/grid/gridwise_gemm_dl_multiple_d.hpp | 0 .../gpu/grid/gridwise_gemm_dl_v1r3.hpp | 0 .../gpu/grid/gridwise_gemm_dpp.hpp | 0 ...ridwise_gemm_multiple_abd_xdl_cshuffle.hpp | 0 ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp | 0 ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 0 .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 0 ...se_gemm_multiple_d_xdl_splitk_cshuffle.hpp | 0 .../grid/gridwise_gemm_pipeline_selector.hpp | 0 .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 0 .../gpu/grid/gridwise_gemm_pipeline_v2.hpp | 0 .../gpu/grid/gridwise_gemm_pipeline_v3.hpp | 0 .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp | 0 ...e_gemm_split_k_multiple_d_xdl_cshuffle.hpp | 0 ...emm_split_k_multiple_d_xdl_cshuffle_v2.hpp | 0 .../gpu/grid/gridwise_gemm_waveletmodel.hpp | 0 .../gpu/grid/gridwise_gemm_wmma.hpp | 0 
.../grid/gridwise_gemm_xdl_cshuffle_v1.hpp | 0 ...ridwise_gemm_xdl_layernorm_cshuffle_v1.hpp | 0 ...ridwise_gemm_xdl_waveletmodel_cshuffle.hpp | 0 .../grid/gridwise_gemm_xdlops_bwd_weight.hpp | 0 .../gridwise_gemm_xdlops_skip_b_lds_v1.hpp | 0 .../gpu/grid/gridwise_gemm_xdlops_streamk.hpp | 0 .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp | 0 .../gpu/grid/gridwise_gemm_xdlops_v2r4.hpp | 0 .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp | 0 .../gpu/grid/gridwise_gemm_xdlops_v3r1.hpp | 0 .../gpu/grid/gridwise_gemm_xdlops_v3r2.hpp | 0 .../gpu/grid/gridwise_gemm_xdlops_v3r3.hpp | 0 .../gpu/grid/gridwise_gemv_splitk.hpp | 0 .../gpu/grid/gridwise_permute.hpp | 0 .../gpu/grid/gridwise_put_element_1d.hpp | 0 .../gpu/grid/gridwise_set_buffer_value.hpp | 0 .../gridwise_set_multiple_buffer_value.hpp | 0 .../gpu/grid/gridwise_softmax.hpp | 0 ...se_sparse_embeddings_forward_layernorm.hpp | 0 .../gridwise_tall_and_skinny_gemm_splitk.hpp | 773 ++++++++++++++++++ .../gpu/grid/gridwise_tensor_rearrange.hpp | 0 .../gridwise_normalization_naive_variance.hpp | 0 .../gridwise_normalization_selector.hpp | 0 .../gridwise_normalization_splitk_1st.hpp | 0 .../gridwise_normalization_splitk_2nd.hpp | 0 ...ridwise_normalization_welford_variance.hpp | 0 .../thread/reduction_functions_threadwise.hpp | 0 .../gpu/thread/threadwise_contraction_dl.hpp | 0 .../gpu/thread/threadwise_gemm_dlops_v3.hpp | 0 .../thread/threadwise_tensor_slice_set.hpp | 0 .../threadwise_tensor_slice_transfer.hpp | 0 .../threadwise_tensor_slice_transfer_v3r1.hpp | 0 .../threadwise_tensor_slice_transfer_v4r1.hpp | 0 .../threadwise_tensor_slice_transfer_v5r1.hpp | 0 .../threadwise_tensor_slice_transfer_v6r1.hpp | 0 ...hreadwise_tensor_slice_transfer_v6r1r2.hpp | 0 .../threadwise_tensor_slice_transfer_v6r2.hpp | 0 .../threadwise_tensor_slice_transfer_v6r3.hpp | 0 .../threadwise_tensor_slice_transfer_v7.hpp | 0 .../threadwise_tensor_slice_transfer_v7r2.hpp | 0 .../gpu/thread/threadwise_welford.hpp | 0 
.../ck/tensor_operation/gpu/warp/dpp_gemm.hpp | 0 .../tensor_operation/gpu/warp/wmma_gemm.hpp | 0 .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 0 .../transform_contraction_to_gemm.hpp | 0 .../transform_conv_bwd_data_to_gemm_v1.hpp | 0 .../transform_conv_fwd_to_gemm.hpp | 0 .../include}/ck/utility/amd_address_space.hpp | 0 .../ck/utility/amd_buffer_addressing.hpp | 0 .../include}/ck/utility/amd_gemm_dpp.hpp | 0 .../include}/ck/utility/amd_inline_asm.hpp | 0 .../ck/utility/amd_wave_read_first_lane.hpp | 0 .../include}/ck/utility/amd_wmma.hpp | 0 .../include}/ck/utility/amd_xdlops.hpp | 0 .../include}/ck/utility/array.hpp | 0 .../include}/ck/utility/array_multi_index.hpp | 0 .../ck/utility/c_style_pointer_cast.hpp | 0 .../include}/ck/utility/common_header.hpp | 0 .../ck/utility/container_element_picker.hpp | 0 .../include}/ck/utility/container_helper.hpp | 0 .../include}/ck/utility/data_type.hpp | 0 .../include}/ck/utility/debug.hpp | 0 .../include}/ck/utility/dynamic_buffer.hpp | 0 .../include}/ck/utility/enable_if.hpp | 0 .../include}/ck/utility/f8_utils.hpp | 0 .../include}/ck/utility/functional.hpp | 0 .../include}/ck/utility/functional2.hpp | 0 .../include}/ck/utility/functional3.hpp | 0 .../include}/ck/utility/functional4.hpp | 0 .../utility/generic_memory_space_atomic.hpp | 0 .../include}/ck/utility/get_id.hpp | 0 .../include}/ck/utility/get_shift.hpp | 0 .../include}/ck/utility/ignore.hpp | 0 .../include}/ck/utility/inner_product.hpp | 0 .../ck/utility/inner_product_dpp8.hpp | 0 .../include}/ck/utility/integral_constant.hpp | 0 .../include}/ck/utility/is_detected.hpp | 0 .../ck/utility/is_known_at_compile_time.hpp | 0 .../include}/ck/utility/loop_scheduler.hpp | 0 .../include}/ck/utility/magic_division.hpp | 0 .../include}/ck/utility/math.hpp | 0 .../include}/ck/utility/math_v2.hpp | 0 .../include}/ck/utility/multi_index.hpp | 0 .../include}/ck/utility/number.hpp | 0 .../include}/ck/utility/random_gen.hpp | 0 .../include}/ck/utility/reduction_common.hpp | 0 
.../include}/ck/utility/reduction_enums.hpp | 0 .../reduction_functions_accumulate.hpp | 0 .../ck/utility/reduction_operator.hpp | 0 .../include}/ck/utility/sequence.hpp | 0 .../include}/ck/utility/sequence_helper.hpp | 0 .../include}/ck/utility/span.hpp | 0 .../include}/ck/utility/static_buffer.hpp | 0 .../ck/utility/statically_indexed_array.hpp | 0 .../statically_indexed_array_multi_index.hpp | 0 .../include}/ck/utility/synchronization.hpp | 0 .../include}/ck/utility/thread_group.hpp | 0 .../include}/ck/utility/transpose_vectors.hpp | 0 .../include}/ck/utility/tuple.hpp | 0 .../include}/ck/utility/tuple_helper.hpp | 0 .../include}/ck/utility/type.hpp | 0 .../include}/ck/utility/type_convert.hpp | 0 .../include}/ck/utility/workgroup_barrier.hpp | 0 .../ck/utility/workgroup_synchronization.hpp | 0 .../include}/ck/version.h.in | 0 .../library}/CMakeLists.txt | 0 .../cpu/reference_avgpool_bwd.hpp | 0 .../cpu/reference_batched_gemm.hpp | 0 .../cpu/reference_batchnorm_backward.hpp | 0 .../cpu/reference_batchnorm_forward.hpp | 0 .../cpu/reference_batchnorm_infer.hpp | 0 .../cpu/reference_cgemm.hpp | 0 .../cpu/reference_column_to_image.hpp | 0 .../cpu/reference_contraction.hpp | 0 .../cpu/reference_conv_bwd_data.hpp | 0 .../cpu/reference_conv_bwd_weight.hpp | 0 .../cpu/reference_conv_fwd.hpp | 0 .../reference_conv_fwd_bias_activation.hpp | 0 ...reference_conv_fwd_bias_activation_add.hpp | 0 .../cpu/reference_gemm.hpp | 0 .../cpu/reference_gemm_layernorm.hpp | 0 .../cpu/reference_groupnorm.hpp | 0 .../cpu/reference_image_to_column.hpp | 0 .../cpu/reference_layernorm.hpp | 0 .../cpu/reference_maxpool_bwd.hpp | 0 .../cpu/reference_pool_fwd.hpp | 0 .../cpu/reference_reduce.hpp | 0 .../cpu/reference_softmax.hpp | 0 ...ce_sparse_embedding3_forward_layernorm.hpp | 0 .../gpu/naive_conv_fwd.hpp | 0 .../add_device_operation_instance.hpp | 0 .../device_operation_instance_factory.hpp | 0 .../gpu/avg_pool3d_bwd.hpp | 0 .../gpu/batched_gemm.hpp | 0 
.../gpu/batched_gemm_add_relu_gemm_add.hpp | 0 .../gpu/batched_gemm_bias_permute.hpp | 0 ...batched_gemm_bias_softmax_gemm_permute.hpp | 0 .../gpu/batched_gemm_gemm.hpp | 0 .../gpu/batched_gemm_multi_d.hpp | 0 .../gpu/batched_gemm_softmax_gemm.hpp | 0 .../gpu/batched_gemm_softmax_gemm_permute.hpp | 0 .../gpu/batchnorm_backward.hpp | 0 .../gpu/batchnorm_forward.hpp | 0 .../gpu/batchnorm_infer.hpp | 0 .../gpu/contraction_bilinear.hpp | 0 .../gpu/contraction_scale.hpp | 0 .../gpu/conv_tensor_rearrange.hpp | 0 .../device_column_to_image_instance.hpp | 0 .../device_image_to_column_instance.hpp | 0 .../gpu/convolution_backward_data.hpp | 0 .../gpu/convolution_forward.hpp | 0 .../gpu/device_elementwise_instance.hpp | 0 .../device_gemm_mean_squaremean_instance.hpp | 0 .../gpu/elementwise_normalization.hpp | 0 .../tensor_operation_instance/gpu/gemm.hpp | 0 .../gpu/gemm_add_add_fastgelu.hpp | 0 .../gpu/gemm_add_fastgelu.hpp | 0 .../gpu/gemm_add_multiply.hpp | 0 .../gpu/gemm_add_relu_add_layernorm.hpp | 0 .../gpu/gemm_bilinear.hpp | 0 .../gpu/gemm_fastgelu.hpp | 0 .../gpu/gemm_multiply_add.hpp | 0 .../gpu/gemm_splitk.hpp | 0 .../gpu/gemm_streamk.hpp | 0 .../gpu/gemv_splitk.hpp | 10 +- ...ce_grouped_conv_bwd_data_wmma_instance.hpp | 0 ...ice_grouped_conv_bwd_data_xdl_instance.hpp | 0 ...ce_grouped_conv_bwd_weight_dl_instance.hpp | 0 ..._grouped_conv_bwd_weight_wmma_instance.hpp | 0 ...e_grouped_conv_bwd_weight_xdl_instance.hpp | 0 ...evice_grouped_conv2d_fwd_wmma_instance.hpp | 0 .../device_grouped_conv_fwd_dl_instance.hpp | 0 .../device_grouped_conv_fwd_wmma_instance.hpp | 0 .../device_grouped_conv_fwd_xdl_instance.hpp | 0 .../gpu/grouped_convolution_backward_data.hpp | 0 .../grouped_convolution_backward_weight.hpp | 0 .../gpu/grouped_convolution_forward.hpp | 0 .../gpu/grouped_gemm.hpp | 0 .../gpu/grouped_gemm_bias.hpp | 0 .../gpu/grouped_gemm_fastgelu.hpp | 0 .../gpu/grouped_gemm_fixed_nk.hpp | 0 .../gpu/image_to_column.hpp | 0 .../gpu/max_pool_bwd.hpp | 0 
.../gpu/normalization.hpp | 0 .../gpu/normalization_swish.hpp | 0 .../gpu/pool3d_fwd.hpp | 0 .../gpu/quantization/gemm_quantization.hpp | 0 ...n_bias_forward_perchannel_quantization.hpp | 0 ...ion_bias_forward_perlayer_quantization.hpp | 0 ...lution_forward_perchannel_quantization.hpp | 0 ...volution_forward_perlayer_quantization.hpp | 0 .../gpu/reduce/device_reduce_instance.hpp | 0 .../device_reduce_instance_blockwise.hpp | 0 ...uce_instance_blockwise_b16_f32_b16_add.hpp | 0 ...ce_instance_blockwise_b16_f32_b16_amax.hpp | 0 ...uce_instance_blockwise_b16_f32_b16_avg.hpp | 0 ...uce_instance_blockwise_b16_f32_b16_max.hpp | 0 ...uce_instance_blockwise_b16_f32_b16_min.hpp | 0 ...e_instance_blockwise_b16_f32_b16_norm2.hpp | 0 ...ce_instance_blockwise_f16_f16_f16_amax.hpp | 0 ...uce_instance_blockwise_f16_f16_f16_max.hpp | 0 ...uce_instance_blockwise_f16_f16_f16_min.hpp | 0 ...uce_instance_blockwise_f16_f32_f16_add.hpp | 0 ...uce_instance_blockwise_f16_f32_f16_avg.hpp | 0 ...e_instance_blockwise_f16_f32_f16_norm2.hpp | 0 ...uce_instance_blockwise_f32_f32_f32_add.hpp | 0 ...ce_instance_blockwise_f32_f32_f32_amax.hpp | 0 ...uce_instance_blockwise_f32_f32_f32_avg.hpp | 0 ...uce_instance_blockwise_f32_f32_f32_max.hpp | 0 ...uce_instance_blockwise_f32_f32_f32_min.hpp | 0 ...e_instance_blockwise_f32_f32_f32_norm2.hpp | 0 ...uce_instance_blockwise_f32_f64_f32_add.hpp | 0 ...uce_instance_blockwise_f32_f64_f32_avg.hpp | 0 ...e_instance_blockwise_f32_f64_f32_norm2.hpp | 0 ...uce_instance_blockwise_f64_f64_f64_add.hpp | 0 ...ce_instance_blockwise_f64_f64_f64_amax.hpp | 0 ...uce_instance_blockwise_f64_f64_f64_avg.hpp | 0 ...uce_instance_blockwise_f64_f64_f64_max.hpp | 0 ...uce_instance_blockwise_f64_f64_f64_min.hpp | 0 ...e_instance_blockwise_f64_f64_f64_norm2.hpp | 0 ...educe_instance_blockwise_i8_i32_i8_add.hpp | 0 ...educe_instance_blockwise_i8_i32_i8_avg.hpp | 0 ...educe_instance_blockwise_i8_i8_i8_amax.hpp | 0 ...reduce_instance_blockwise_i8_i8_i8_max.hpp | 0 
...reduce_instance_blockwise_i8_i8_i8_min.hpp | 0 .../device_reduce_instance_impl_common.hpp | 0 ..._reduce_instance_multiblock_atomic_add.hpp | 0 ..._multiblock_atomic_add_b16_f32_f32_add.hpp | 0 ..._multiblock_atomic_add_b16_f32_f32_avg.hpp | 0 ..._multiblock_atomic_add_f16_f32_f32_add.hpp | 0 ..._multiblock_atomic_add_f16_f32_f32_avg.hpp | 0 ..._multiblock_atomic_add_f32_f32_f32_add.hpp | 0 ..._multiblock_atomic_add_f32_f32_f32_avg.hpp | 0 ..._multiblock_atomic_add_f32_f64_f32_add.hpp | 0 ..._multiblock_atomic_add_f32_f64_f32_avg.hpp | 0 ..._multiblock_atomic_add_f64_f64_f64_add.hpp | 0 ..._multiblock_atomic_add_f64_f64_f64_avg.hpp | 0 .../device_reduce_instance_threadwise.hpp | 0 ...ce_instance_threadwise_b16_f32_b16_add.hpp | 0 ...e_instance_threadwise_b16_f32_b16_amax.hpp | 0 ...ce_instance_threadwise_b16_f32_b16_avg.hpp | 0 ...ce_instance_threadwise_b16_f32_b16_max.hpp | 0 ...ce_instance_threadwise_b16_f32_b16_min.hpp | 0 ..._instance_threadwise_b16_f32_b16_norm2.hpp | 0 ...e_instance_threadwise_f16_f16_f16_amax.hpp | 0 ...ce_instance_threadwise_f16_f16_f16_max.hpp | 0 ...ce_instance_threadwise_f16_f16_f16_min.hpp | 0 ...ce_instance_threadwise_f16_f32_f16_add.hpp | 0 ...ce_instance_threadwise_f16_f32_f16_avg.hpp | 0 ..._instance_threadwise_f16_f32_f16_norm2.hpp | 0 ...ce_instance_threadwise_f32_f32_f32_add.hpp | 0 ...e_instance_threadwise_f32_f32_f32_amax.hpp | 0 ...ce_instance_threadwise_f32_f32_f32_avg.hpp | 0 ...ce_instance_threadwise_f32_f32_f32_max.hpp | 0 ...ce_instance_threadwise_f32_f32_f32_min.hpp | 0 ..._instance_threadwise_f32_f32_f32_norm2.hpp | 0 ...ce_instance_threadwise_f32_f64_f32_add.hpp | 0 ...ce_instance_threadwise_f32_f64_f32_avg.hpp | 0 ..._instance_threadwise_f32_f64_f32_norm2.hpp | 0 ...ce_instance_threadwise_f64_f64_f64_add.hpp | 0 ...e_instance_threadwise_f64_f64_f64_amax.hpp | 0 ...ce_instance_threadwise_f64_f64_f64_avg.hpp | 0 ...ce_instance_threadwise_f64_f64_f64_max.hpp | 0 ...ce_instance_threadwise_f64_f64_f64_min.hpp | 0 
..._instance_threadwise_f64_f64_f64_norm2.hpp | 0 ...duce_instance_threadwise_i8_i32_i8_add.hpp | 0 ...duce_instance_threadwise_i8_i32_i8_avg.hpp | 0 ...duce_instance_threadwise_i8_i8_i8_amax.hpp | 0 ...educe_instance_threadwise_i8_i8_i8_max.hpp | 0 ...educe_instance_threadwise_i8_i8_i8_min.hpp | 0 .../gpu/reduce/reduce.hpp | 0 .../tensor_operation_instance/gpu/softmax.hpp | 0 ...softmax_f16_f16_instance_rank3_reduce1.hpp | 0 ...softmax_f16_f16_instance_rank3_reduce2.hpp | 0 ...softmax_f16_f16_instance_rank3_reduce3.hpp | 0 ...softmax_f16_f16_instance_rank4_reduce1.hpp | 0 ...softmax_f16_f16_instance_rank4_reduce2.hpp | 0 ...softmax_f16_f16_instance_rank4_reduce3.hpp | 0 ...softmax_f16_f16_instance_rank4_reduce4.hpp | 0 .../device_softmax_f16_f16_instance_type.hpp | 0 ...softmax_f32_f32_instance_rank3_reduce1.hpp | 0 ...softmax_f32_f32_instance_rank3_reduce2.hpp | 0 ...softmax_f32_f32_instance_rank3_reduce3.hpp | 0 ...softmax_f32_f32_instance_rank4_reduce1.hpp | 0 ...softmax_f32_f32_instance_rank4_reduce2.hpp | 0 ...softmax_f32_f32_instance_rank4_reduce3.hpp | 0 ...softmax_f32_f32_instance_rank4_reduce4.hpp | 0 .../device_softmax_f32_f32_instance_type.hpp | 0 .../gpu/softmax/device_softmax_instance.hpp | 0 .../gpu/tall_and_skinny_gemm_splitk.hpp | 83 ++ .../include/ck/library/utility/algorithm.hpp | 0 .../include/ck/library/utility/check_err.hpp | 0 .../ck/library/utility/conv_common.hpp | 0 ...volution_host_tensor_descriptor_helper.hpp | 0 .../library/utility/convolution_parameter.hpp | 0 .../ck/library/utility/device_memory.hpp | 0 .../include/ck/library/utility/fill.hpp | 0 .../ck/library/utility/host_common_util.hpp | 0 .../include/ck/library/utility/host_gemm.hpp | 0 .../ck/library/utility/host_tensor.hpp | 0 .../library/utility/host_tensor_generator.hpp | 0 .../include/ck/library/utility/iterator.hpp | 0 .../include/ck/library/utility/literals.hpp | 0 .../include/ck/library/utility/numeric.hpp | 0 .../include/ck/library/utility/ranges.hpp | 0 
.../gpu/CMakeLists.txt | 0 .../gpu/avg_pool3d_bwd/CMakeLists.txt | 0 .../avg_pool3d_bwd_ndhwc_instance_common.hpp | 0 ...ice_avg_pool3d_bwd_ndhwc_bf16_instance.cpp | 0 ...vice_avg_pool3d_bwd_ndhwc_f16_instance.cpp | 0 ...vice_avg_pool3d_bwd_ndhwc_f32_instance.cpp | 0 .../gpu/batched_gemm/CMakeLists.txt | 0 ...dl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp | 0 ...dl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp | 0 ...dl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp | 0 ...dl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp | 0 ...m_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp | 0 ...m_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp | 0 ...m_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp | 0 ...m_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp | 0 ...m_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp | 0 ...m_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp | 0 ...m_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp | 0 ...m_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp | 0 ...dl_int8_int8_int8_gkm_gkn_gmn_instance.cpp | 0 ...dl_int8_int8_int8_gkm_gnk_gmn_instance.cpp | 0 ...dl_int8_int8_int8_gmk_gkn_gmn_instance.cpp | 0 ...dl_int8_int8_int8_gmk_gnk_gmn_instance.cpp | 0 .../CMakeLists.txt | 0 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 0 ...6_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 0 .../batched_gemm_bias_permute/CMakeLists.txt | 0 ...xdl_c_shuffle_f16_f16_f16_f16_instance.cpp | 0 .../gpu/batched_gemm_gemm/CMakeLists.txt | 0 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 0 ...6_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 0 .../gpu/batched_gemm_multi_d/CMakeLists.txt | 0 ..._d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp | 0 ...f16_f16_gkm_gkn_gmn_irregular_instance.cpp | 0 ..._d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp | 0 ...f16_f16_gkm_gnk_gmn_irregular_instance.cpp | 0 ..._d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp | 0 ...f16_f16_gmk_gkn_gmn_irregular_instance.cpp | 0 ..._d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp | 0 ...f16_f16_gmk_gnk_gmn_irregular_instance.cpp | 0 ...lti_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp | 0 
...8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp | 0 ...lti_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp | 0 ...8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp | 0 ...lti_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp | 0 ...8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp | 0 ...lti_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp | 0 ...8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp | 0 .../gpu/batched_gemm_reduce/CMakeLists.txt | 0 ...6_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp | 0 ...6_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp | 0 ...6_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp | 0 ...6_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp | 0 .../batched_gemm_softmax_gemm/CMakeLists.txt | 0 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 0 .../CMakeLists.txt | 0 ...f16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp | 0 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 0 ...f16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp | 0 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 0 .../gpu/batchnorm/CMakeLists.txt | 0 ...evice_batchnorm_backward_bf16_instance.cpp | 0 ...device_batchnorm_backward_f16_instance.cpp | 0 ...device_batchnorm_backward_f32_instance.cpp | 0 ...device_batchnorm_backward_f64_instance.cpp | 0 ...device_batchnorm_forward_bf16_instance.cpp | 0 .../device_batchnorm_forward_f16_instance.cpp | 0 .../device_batchnorm_forward_f32_instance.cpp | 0 .../device_batchnorm_forward_f64_instance.cpp | 0 .../device_batchnorm_infer_bf16_instance.cpp | 0 .../device_batchnorm_infer_f16_instance.cpp | 0 .../device_batchnorm_infer_f32_instance.cpp | 0 .../device_batchnorm_infer_f64_instance.cpp | 0 .../gpu/column_to_image/CMakeLists.txt | 0 ...evice_column_to_image_nhwc_1d_instance.cpp | 0 ...evice_column_to_image_nhwc_2d_instance.cpp | 0 ...evice_column_to_image_nhwc_3d_instance.cpp | 0 .../gpu/contraction_bilinear/CMakeLists.txt | 0 ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 0 ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 0 ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 0 ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 0 
..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 0 ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 0 ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 0 ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 0 .../gpu/contraction_scale/CMakeLists.txt | 0 ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 0 ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 0 ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 0 ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 0 ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 0 ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 0 ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 0 ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 0 .../gpu/conv1d_bwd_data/CMakeLists.txt | 0 ...bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp | 0 ..._bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp | 0 ..._bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp | 0 ...bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp | 0 .../gpu/conv2d_bwd_data/CMakeLists.txt | 0 ...wd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp | 0 ...wd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp | 0 ...d_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp | 0 ..._data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 0 ...d_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 0 ...d_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 0 ..._data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 0 .../gpu/conv2d_fwd/CMakeLists.txt | 0 ..._c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp | 0 ...d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 0 ...2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 0 ...2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 0 ...d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 0 .../gpu/conv2d_fwd_bias_relu/CMakeLists.txt | 0 ..._bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp | 0 .../conv2d_fwd_bias_relu_add/CMakeLists.txt | 0 ...s_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp | 0 .../gpu/conv3d_bwd_data/CMakeLists.txt | 0 ...ta_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 0 ...ata_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 0 ...ata_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 0 
...ta_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp | 0 .../gpu/elementwise/CMakeLists.txt | 0 .../elementwise/device_normalize_instance.cpp | 0 .../elementwise_normalization/CMakeLists.txt | 0 ...elementwise_normalization_f16_instance.cpp | 0 .../gpu/gemm/CMakeLists.txt | 0 ..._gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp | 0 ...16_f16_f16_km_kn_mn_irregular_instance.cpp | 0 ..._gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp | 0 ...16_f16_f16_km_nk_mn_irregular_instance.cpp | 0 ..._gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ...16_f16_f16_mk_kn_mn_irregular_instance.cpp | 0 ..._gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ...16_f16_f16_mk_nk_mn_irregular_instance.cpp | 0 ..._gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp | 0 ..._gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp | 0 ..._gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp | 0 ..._gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp | 0 ...ice_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp | 0 ...l_i8_i8_i8_km_kn_mn_irregular_instance.cpp | 0 ...ice_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp | 0 ...l_i8_i8_i8_km_nk_mn_irregular_instance.cpp | 0 ...ice_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp | 0 ...l_i8_i8_i8_mk_kn_mn_irregular_instance.cpp | 0 ...ice_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp | 0 ...l_i8_i8_i8_mk_nk_mn_irregular_instance.cpp | 0 ...gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp | 0 ...16_f16_f16_km_kn_mn_irregular_instance.cpp | 0 ...gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp | 0 ...16_f16_f16_km_nk_mn_irregular_instance.cpp | 0 ...gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ...16_f16_f16_mk_kn_mn_irregular_instance.cpp | 0 ...gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ...16_f16_f16_mk_nk_mn_irregular_instance.cpp | 0 ..._2_stage_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ...uffle_bf16_bf16_bf16_km_kn_mn_instance.cpp | 0 ...uffle_bf16_bf16_bf16_km_nk_mn_instance.cpp | 0 ...uffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp | 0 ...uffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp | 0 ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp | 0 
..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp | 0 ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ..._shuffle_f32_f32_f32_km_kn_mn_instance.cpp | 0 ..._shuffle_f32_f32_f32_km_nk_mn_instance.cpp | 0 ..._shuffle_f32_f32_f32_mk_kn_mn_instance.cpp | 0 ..._shuffle_f32_f32_f32_mk_nk_mn_instance.cpp | 0 ..._shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp | 0 ..._shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp | 0 ..._shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp | 0 ..._shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp | 0 .../device_gemm_xdl_f16_f16_f16/common.hpp | 0 .../km_kn_mn_add_instance.cpp | 0 .../km_kn_mn_default_pipeline_v1_instance.cpp | 0 .../km_kn_mn_default_pipeline_v2_instance.cpp | 0 ...kn_mn_default_pipeline_v2_opt_instance.cpp | 0 ...m_kn_mn_interwave_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v2_instance.cpp | 0 ...regular_interwave_pipeline_v1_instance.cpp | 0 .../km_nk_mn_add_instance.cpp | 0 .../km_nk_mn_default_pipeline_v1_instance.cpp | 0 .../km_nk_mn_default_pipeline_v2_instance.cpp | 0 ...nk_mn_default_pipeline_v2_opt_instance.cpp | 0 ...m_nk_mn_interwave_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v2_instance.cpp | 0 ...regular_interwave_pipeline_v1_instance.cpp | 0 .../mk_kn_mn_add_instance.cpp | 0 .../mk_kn_mn_default_pipeline_v1_instance.cpp | 0 .../mk_kn_mn_default_pipeline_v2_instance.cpp | 0 ...kn_mn_default_pipeline_v2_opt_instance.cpp | 0 ...k_kn_mn_interwave_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v2_instance.cpp | 0 ...regular_interwave_pipeline_v1_instance.cpp | 0 .../mk_nk_mn_add_instance.cpp | 0 
.../mk_nk_mn_default_pipeline_v1_instance.cpp | 0 .../mk_nk_mn_default_pipeline_v2_instance.cpp | 0 ...nk_mn_default_pipeline_v2_opt_instance.cpp | 0 ...k_nk_mn_interwave_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v1_instance.cpp | 0 ...irregular_default_pipeline_v2_instance.cpp | 0 ...regular_interwave_pipeline_v1_instance.cpp | 0 ...gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp | 0 ...gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp | 0 ...gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp | 0 ...gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp | 0 ...gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp | 0 ...gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp | 0 ...gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp | 0 ...gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp | 0 .../gpu/gemm_add_add_fastgelu/CMakeLists.txt | 0 ...16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp | 0 ...16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp | 0 ...16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp | 0 ...16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp | 0 .../gpu/gemm_add_fastgelu/CMakeLists.txt | 0 ...e_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp | 0 ...e_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp | 0 ...e_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp | 0 ...e_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp | 0 .../gpu/gemm_add_multiply/CMakeLists.txt | 0 ...16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp | 0 ...16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp | 0 ...16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp | 0 ...16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp | 0 .../CMakeLists.txt | 0 ..._layernorm_f16_km_kn_mn_mn_mn_instance.cpp | 0 ..._layernorm_f16_km_nk_mn_mn_mn_instance.cpp | 0 ..._layernorm_f16_mk_kn_mn_mn_mn_instance.cpp | 0 ..._layernorm_f16_mk_nk_mn_mn_mn_instance.cpp | 0 .../gpu/gemm_bias_add_reduce/CMakeLists.txt | 0 ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 0 ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 0 ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 0 ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 0 .../gpu/gemm_bilinear/CMakeLists.txt | 
0 ...uffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp | 0 ...uffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp | 0 ...uffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp | 0 ...uffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp | 0 ...e_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp | 0 ...e_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp | 0 ...e_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp | 0 ...e_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp | 0 .../gpu/gemm_fastgelu/CMakeLists.txt | 0 ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp | 0 ..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp | 0 ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp | 0 .../gpu/gemm_multiply_add/CMakeLists.txt | 0 ...16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp | 0 ...16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp | 0 ...f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp | 0 ...f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp | 0 .../gpu/gemm_reduce/CMakeLists.txt | 0 ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 0 ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 0 ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 0 ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 0 .../gpu/gemm_splitk/CMakeLists.txt | 0 ...f16_f16_f16_comp_fp8_km_kn_mn_instance.cpp | 0 ...f16_f16_f16_comp_fp8_km_nk_mn_instance.cpp | 0 ...f16_f16_f16_comp_fp8_mk_kn_mn_instance.cpp | 0 ...f16_f16_f16_comp_fp8_mk_nk_mn_instance.cpp | 0 ...l_splitk_f16_f16_f16_km_kn_mn_instance.cpp | 0 ...l_splitk_f16_f16_f16_km_nk_mn_instance.cpp | 0 ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ...l_splitk_f16_fp8_f16_km_kn_mn_instance.cpp | 0 ...l_splitk_f16_fp8_f16_km_nk_mn_instance.cpp | 0 ...l_splitk_f16_fp8_f16_mk_kn_mn_instance.cpp | 0 ...l_splitk_f16_fp8_f16_mk_nk_mn_instance.cpp | 0 ...l_splitk_f32_f32_f32_km_kn_mn_instance.cpp | 0 ...l_splitk_f32_f32_f32_km_nk_mn_instance.cpp | 0 ...l_splitk_f32_f32_f32_mk_kn_mn_instance.cpp | 0 ...l_splitk_f32_f32_f32_mk_nk_mn_instance.cpp | 0 
...l_splitk_fp8_f16_f16_km_kn_mn_instance.cpp | 0 ...l_splitk_fp8_f16_f16_km_nk_mn_instance.cpp | 0 ...l_splitk_fp8_f16_f16_mk_kn_mn_instance.cpp | 0 ...l_splitk_fp8_f16_f16_mk_nk_mn_instance.cpp | 0 .../gpu/gemm_streamk/CMakeLists.txt | 0 ..._streamk_f16_f16_f16_mk_kn_mn_instance.cpp | 0 .../gpu/gemv_splitk/CMakeLists.txt | 17 + ...v_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 148 ++-- ...v_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 148 ++-- .../grouped_conv1d_bwd_weight/CMakeLists.txt | 0 ...weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp | 0 ..._weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp | 0 ..._weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp | 0 ...weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp | 0 ..._weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp | 0 ..._weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp | 0 ...eight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp | 0 ...weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp | 0 ...weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp | 0 .../gpu/grouped_conv1d_fwd/CMakeLists.txt | 0 ...d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp | 0 ...1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp | 0 ...1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp | 0 ...d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp | 0 .../grouped_conv2d_bwd_data/CMakeLists.txt | 0 ...gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp | 0 ...ta_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ..._gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp | 0 ...ata_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp | 0 ...nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp | 0 ...ta_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 0 ..._nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp | 0 ...ata_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp | 0 ...ta_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 0 ...ata_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ...ata_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 0 ...ta_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 0 ...ata_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 0 ...ata_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 0 .../grouped_conv2d_bwd_weight/CMakeLists.txt | 0 
...ght_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 0 ...ight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ...ight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 0 ...ght_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 0 ...ight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 0 ...ight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 0 ...ht_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 0 ...ght_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ...ght_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 0 ...ht_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 0 ...ght_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 0 ...ght_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 0 .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 0 ...wd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ...fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp | 0 ..._fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ..._fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 0 ..._fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 0 ..._fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 0 ...a_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp | 0 ...gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp | 0 ...wd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ...ma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp | 0 ...ma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp | 0 ..._gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp | 0 ...fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp | 0 ...mma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp | 0 ...a_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp | 0 ...nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp | 0 ...wd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 0 ...ma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp | 0 ...ma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp | 0 ..._nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp | 0 ...fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp | 0 ...mma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp | 0 ...wd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 0 ...fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ...fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 0 ...wd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 0 ...fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 0 
...fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 0 .../grouped_conv3d_bwd_data/CMakeLists.txt | 0 ...hwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp | 0 ...wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ...dhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp | 0 ..._wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp | 0 ...wgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp | 0 ...wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 0 ...hwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp | 0 ..._wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp | 0 ...xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 0 ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 0 ..._ndhwgk_input_f16_comp_bf8_f8_instance.cpp | 0 .../grouped_conv3d_bwd_weight/CMakeLists.txt | 0 ..._dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 0 ...t_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ...t_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 0 ..._dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 0 ...t_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 0 ...t_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 0 ...hwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp | 0 ...wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ...dhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp | 0 ..._wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp | 0 ...wgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp | 0 ...wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 0 ...hwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp | 0 ..._wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp | 0 ...xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 0 ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 0 ...kzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 0 .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 0 
...ndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp | 0 ...hwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp | 0 ...wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ...gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp | 0 ...gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp | 0 ...dhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp | 0 ..._wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp | 0 ..._gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp | 0 ...dhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp | 0 ...wgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp | 0 ...wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 0 ...ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp | 0 ...ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp | 0 ...hwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp | 0 ..._wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp | 0 ..._ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp | 0 ...xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 0 ...xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp | 0 ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 0 ...gc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 0 ...xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp | 0 .../gpu/grouped_gemm/CMakeLists.txt | 0 ...gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 0 ...gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 0 ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ...gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ...16_f16_f16_mk_kn_mn_irregular_instance.cpp | 0 ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ...16_f16_f16_mk_nk_mn_irregular_instance.cpp | 0 .../gpu/grouped_gemm_bias/CMakeLists.txt | 0 ..._nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ..._nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ..._nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp | 0 ..._nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp | 0 .../gpu/grouped_gemm_fastgelu/CMakeLists.txt | 0 
...gelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 0 ...gelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 0 ...gelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ...gelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 0 .../gpu/grouped_gemm_fixed_nk/CMakeLists.txt | 0 ...fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp | 0 ...fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp | 0 ...fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp | 0 ...fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp | 0 ..._fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp | 0 ..._fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp | 0 .../gpu/image_to_column/CMakeLists.txt | 0 ...evice_image_to_column_nhwc_1d_instance.cpp | 0 ...evice_image_to_column_nhwc_2d_instance.cpp | 0 ...evice_image_to_column_nhwc_3d_instance.cpp | 0 .../gpu/max_pool_bwd/CMakeLists.txt | 0 .../device_max_pool_bwd_bf16_instance.cpp | 0 .../device_max_pool_bwd_f16_instance.cpp | 0 .../device_max_pool_bwd_f32_instance.cpp | 0 .../max_pool_bwd_instance_common.hpp | 0 .../gpu/normalization/CMakeLists.txt | 0 .../device_groupnorm_f16_instance.cpp | 0 .../device_groupnorm_f32_instance.cpp | 0 ...oupnorm_swish_f16_f32_f32_f16_instance.cpp | 0 .../device_groupnorm_swish_f16_instance.cpp | 0 .../device_groupnorm_swish_f32_instance.cpp | 0 .../device_layernorm2d_f16_instance.cpp | 0 .../device_layernorm2d_f32_instance.cpp | 0 .../device_layernorm4d_f16_instance.cpp | 0 .../device_layernorm4d_f32_instance.cpp | 0 .../normalization_instance_common.hpp | 0 .../gpu/pool3d_fwd/CMakeLists.txt | 0 ...ice_avg_pool3d_fwd_ndhwc_bf16_instance.cpp | 0 ...vice_avg_pool3d_fwd_ndhwc_f16_instance.cpp | 0 ...vice_avg_pool3d_fwd_ndhwc_f32_instance.cpp | 0 ...ice_max_pool3d_fwd_ndhwc_bf16_instance.cpp | 0 ...vice_max_pool3d_fwd_ndhwc_f16_instance.cpp | 0 ...vice_max_pool3d_fwd_ndhwc_f32_instance.cpp | 0 .../pool3d_fwd/pool_fwd_instance_common.hpp | 0 .../gpu/quantization/CMakeLists.txt | 0 .../conv2d_fwd/conv2d_quantization_common.hpp | 0 ..._perchannel_quantization_int8_instance.cpp | 0 
...as_perlayer_quantization_int8_instance.cpp | 0 .../device_conv2d_dl_int8_instance.hpp | 0 ..._perchannel_quantization_int8_instance.cpp | 0 ...dl_perlayer_quantization_int8_instance.cpp | 0 ..._perchannel_quantization_int8_instance.cpp | 0 ...as_perlayer_quantization_int8_instance.cpp | 0 .../device_conv2d_xdl_int8_instance.hpp | 0 ..._perchannel_quantization_int8_instance.cpp | 0 ...dl_perlayer_quantization_int8_instance.cpp | 0 ...ization_dl_c_shuffle_i8_i8_i8_instance.hpp | 0 ...l_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp | 0 ...zation_xdl_c_shuffle_i8_i8_i8_instance.hpp | 0 ...l_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp | 0 ...l_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp | 0 .../gemm/gemm_quantization_common.hpp | 0 .../gpu/reduce/CMakeLists.txt | 0 ...uce_instance_blockwise_b16_f32_b16_add.cpp | 0 ...ce_instance_blockwise_b16_f32_b16_amax.cpp | 0 ...uce_instance_blockwise_b16_f32_b16_avg.cpp | 0 ...uce_instance_blockwise_b16_f32_b16_max.cpp | 0 ...uce_instance_blockwise_b16_f32_b16_min.cpp | 0 ...e_instance_blockwise_b16_f32_b16_norm2.cpp | 0 ...ce_instance_blockwise_f16_f16_f16_amax.cpp | 0 ...uce_instance_blockwise_f16_f16_f16_max.cpp | 0 ...uce_instance_blockwise_f16_f16_f16_min.cpp | 0 ...uce_instance_blockwise_f16_f32_f16_add.cpp | 0 ...uce_instance_blockwise_f16_f32_f16_avg.cpp | 0 ...e_instance_blockwise_f16_f32_f16_norm2.cpp | 0 ...uce_instance_blockwise_f32_f32_f32_add.cpp | 0 ...ce_instance_blockwise_f32_f32_f32_amax.cpp | 0 ...uce_instance_blockwise_f32_f32_f32_avg.cpp | 0 ...uce_instance_blockwise_f32_f32_f32_max.cpp | 0 ...uce_instance_blockwise_f32_f32_f32_min.cpp | 0 ...e_instance_blockwise_f32_f32_f32_norm2.cpp | 0 ...uce_instance_blockwise_f32_f64_f32_add.cpp | 0 
...uce_instance_blockwise_f32_f64_f32_avg.cpp | 0 ...e_instance_blockwise_f32_f64_f32_norm2.cpp | 0 ...uce_instance_blockwise_f64_f64_f64_add.cpp | 0 ...ce_instance_blockwise_f64_f64_f64_amax.cpp | 0 ...uce_instance_blockwise_f64_f64_f64_avg.cpp | 0 ...uce_instance_blockwise_f64_f64_f64_max.cpp | 0 ...uce_instance_blockwise_f64_f64_f64_min.cpp | 0 ...e_instance_blockwise_f64_f64_f64_norm2.cpp | 0 ...educe_instance_blockwise_i8_i32_i8_add.cpp | 0 ...educe_instance_blockwise_i8_i32_i8_avg.cpp | 0 ...educe_instance_blockwise_i8_i8_i8_amax.cpp | 0 ...reduce_instance_blockwise_i8_i8_i8_max.cpp | 0 ...reduce_instance_blockwise_i8_i8_i8_min.cpp | 0 ..._multiblock_atomic_add_b16_f32_f32_add.cpp | 0 ..._multiblock_atomic_add_b16_f32_f32_avg.cpp | 0 ..._multiblock_atomic_add_f16_f32_f32_add.cpp | 0 ..._multiblock_atomic_add_f16_f32_f32_avg.cpp | 0 ..._multiblock_atomic_add_f32_f32_f32_add.cpp | 0 ..._multiblock_atomic_add_f32_f32_f32_avg.cpp | 0 ..._multiblock_atomic_add_f32_f64_f32_add.cpp | 0 ..._multiblock_atomic_add_f32_f64_f32_avg.cpp | 0 ..._multiblock_atomic_add_f64_f64_f64_add.cpp | 0 ..._multiblock_atomic_add_f64_f64_f64_avg.cpp | 0 ...ce_instance_threadwise_b16_f32_b16_add.cpp | 0 ...e_instance_threadwise_b16_f32_b16_amax.cpp | 0 ...ce_instance_threadwise_b16_f32_b16_avg.cpp | 0 ...ce_instance_threadwise_b16_f32_b16_max.cpp | 0 ...ce_instance_threadwise_b16_f32_b16_min.cpp | 0 ..._instance_threadwise_b16_f32_b16_norm2.cpp | 0 ...e_instance_threadwise_f16_f16_f16_amax.cpp | 0 ...ce_instance_threadwise_f16_f16_f16_max.cpp | 0 ...ce_instance_threadwise_f16_f16_f16_min.cpp | 0 ...ce_instance_threadwise_f16_f32_f16_add.cpp | 0 ...ce_instance_threadwise_f16_f32_f16_avg.cpp | 0 ..._instance_threadwise_f16_f32_f16_norm2.cpp | 0 ...ce_instance_threadwise_f32_f32_f32_add.cpp | 0 ...e_instance_threadwise_f32_f32_f32_amax.cpp | 0 ...ce_instance_threadwise_f32_f32_f32_avg.cpp | 0 ...ce_instance_threadwise_f32_f32_f32_max.cpp | 0 ...ce_instance_threadwise_f32_f32_f32_min.cpp | 0 
..._instance_threadwise_f32_f32_f32_norm2.cpp | 0 ...ce_instance_threadwise_f32_f64_f32_add.cpp | 0 ...ce_instance_threadwise_f32_f64_f32_avg.cpp | 0 ..._instance_threadwise_f32_f64_f32_norm2.cpp | 0 ...ce_instance_threadwise_f64_f64_f64_add.cpp | 0 ...e_instance_threadwise_f64_f64_f64_amax.cpp | 0 ...ce_instance_threadwise_f64_f64_f64_avg.cpp | 0 ...ce_instance_threadwise_f64_f64_f64_max.cpp | 0 ...ce_instance_threadwise_f64_f64_f64_min.cpp | 0 ..._instance_threadwise_f64_f64_f64_norm2.cpp | 0 ...duce_instance_threadwise_i8_i32_i8_add.cpp | 0 ...duce_instance_threadwise_i8_i32_i8_avg.cpp | 0 ...duce_instance_threadwise_i8_i8_i8_amax.cpp | 0 ...educe_instance_threadwise_i8_i8_i8_max.cpp | 0 ...educe_instance_threadwise_i8_i8_i8_min.cpp | 0 .../gpu/softmax/CMakeLists.txt | 0 ...softmax_f16_f16_instance_rank3_reduce1.cpp | 0 ...softmax_f16_f16_instance_rank3_reduce2.cpp | 0 ...softmax_f16_f16_instance_rank3_reduce3.cpp | 0 ...softmax_f16_f16_instance_rank4_reduce1.cpp | 0 ...softmax_f16_f16_instance_rank4_reduce2.cpp | 0 ...softmax_f16_f16_instance_rank4_reduce3.cpp | 0 ...softmax_f16_f16_instance_rank4_reduce4.cpp | 0 ...softmax_f32_f32_instance_rank3_reduce1.cpp | 0 ...softmax_f32_f32_instance_rank3_reduce2.cpp | 0 ...softmax_f32_f32_instance_rank3_reduce3.cpp | 0 ...softmax_f32_f32_instance_rank4_reduce1.cpp | 0 ...softmax_f32_f32_instance_rank4_reduce2.cpp | 0 ...softmax_f32_f32_instance_rank4_reduce3.cpp | 0 ...softmax_f32_f32_instance_rank4_reduce4.cpp | 0 .../CMakeLists.txt | 18 + ...m_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 200 +++++ ...m_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 199 +++++ .../library}/src/utility/CMakeLists.txt | 0 .../src/utility/convolution_parameter.cpp | 0 .../library}/src/utility/device_memory.cpp | 0 .../library}/src/utility/host_tensor.cpp | 0 .../profiler}/CMakeLists.txt | 0 .../profiler}/README.md | 0 .../include/profiler/data_type_enum.hpp | 0 .../profiler/profile_avg_pool3d_bwd_impl.hpp | 0 
...le_batched_gemm_add_relu_gemm_add_impl.hpp | 0 ...ed_gemm_bias_softmax_gemm_permute_impl.hpp | 0 .../profile_batched_gemm_gemm_impl.hpp | 0 .../profiler/profile_batched_gemm_impl.hpp | 0 .../profile_batched_gemm_reduce_impl.hpp | 0 ...profile_batched_gemm_softmax_gemm_impl.hpp | 0 ...batched_gemm_softmax_gemm_permute_impl.hpp | 0 .../profile_batchnorm_backward_impl.hpp | 0 .../profile_batchnorm_forward_impl.hpp | 0 .../profiler/profile_batchnorm_infer_impl.hpp | 0 .../profiler/profile_contraction_impl.hpp | 0 .../profiler/profile_contraction_utils.hpp | 0 .../profiler/profile_conv_bwd_data_impl.hpp | 0 .../profile_conv_fwd_bias_relu_add_impl.hpp | 0 .../profile_conv_fwd_bias_relu_impl.hpp | 0 .../profiler/profile_conv_fwd_impl.hpp | 0 .../profile_conv_tensor_rearrange_impl.hpp | 0 .../profile_elementwise_layernorm_impl.hpp | 0 .../profile_gemm_add_add_fastgelu_impl.hpp | 0 .../profile_gemm_add_fastgelu_impl.hpp | 0 .../profile_gemm_add_multiply_impl.hpp | 0 ...ofile_gemm_add_relu_add_layernorm_impl.hpp | 0 .../profile_gemm_bias_add_reduce_impl.hpp | 0 .../profiler/profile_gemm_bilinear_impl.hpp | 0 .../profiler/profile_gemm_fastgelu_impl.hpp | 0 .../include/profiler/profile_gemm_impl.hpp | 0 .../profile_gemm_multiply_add_impl.hpp | 0 .../profiler/profile_gemm_reduce_impl.hpp | 0 .../profiler/profile_gemm_splitk_impl.hpp | 0 .../profiler/profile_gemm_streamk_impl.hpp | 0 .../profiler/profile_gemv_splitk_impl.hpp | 4 +- .../profile_grouped_conv_bwd_data_impl.hpp | 0 .../profile_grouped_conv_bwd_weight_impl.hpp | 0 .../profile_grouped_conv_fwd_impl.hpp | 0 .../profile_grouped_gemm_fastgelu_impl.hpp | 0 .../profiler/profile_grouped_gemm_impl.hpp | 0 .../profiler/profile_groupnorm_impl.hpp | 0 .../profiler/profile_layernorm_impl.hpp | 0 .../profiler/profile_max_pool3d_bwd_impl.hpp | 0 .../profiler/profile_pool3d_fwd_impl.hpp | 0 .../include/profiler/profile_reduce_impl.hpp | 0 .../include/profiler/profile_softmax_impl.hpp | 0 
...ofile_tall_and_skinny_gemm_splitk_impl.hpp | 297 +++++++ .../profiler}/src/CMakeLists.txt | 0 .../profiler}/src/profile_avg_pool3d_bwd.cpp | 0 .../profiler}/src/profile_batched_gemm.cpp | 0 ...profile_batched_gemm_add_relu_gemm_add.cpp | 0 .../src/profile_batched_gemm_gemm.cpp | 0 .../src/profile_batched_gemm_multi_d.cpp | 0 .../src/profile_batched_gemm_reduce.cpp | 0 .../profiler}/src/profile_batchnorm_bwd.cpp | 0 .../profiler}/src/profile_batchnorm_fwd.cpp | 0 .../profiler}/src/profile_batchnorm_infer.cpp | 0 .../src/profile_contraction_bilinear.cpp | 0 .../src/profile_contraction_scale.cpp | 0 .../profiler}/src/profile_conv_bwd_data.cpp | 0 .../profiler}/src/profile_conv_fwd.cpp | 0 .../src/profile_conv_fwd_bias_relu.cpp | 0 .../src/profile_conv_fwd_bias_relu_add.cpp | 0 .../src/profile_conv_tensor_rearrange.cpp | 0 .../profiler}/src/profile_gemm.cpp | 0 .../src/profile_gemm_add_add_fastgelu.cpp | 0 .../src/profile_gemm_add_fastgelu.cpp | 0 .../src/profile_gemm_add_multiply.cpp | 0 .../profile_gemm_add_relu_add_layernorm.cpp | 0 .../src/profile_gemm_bias_add_reduce.cpp | 0 .../profiler}/src/profile_gemm_bilinear.cpp | 0 .../profiler}/src/profile_gemm_fastgelu.cpp | 0 .../src/profile_gemm_multiply_add.cpp | 0 .../profiler}/src/profile_gemm_reduce.cpp | 0 .../profiler}/src/profile_gemm_splitk.cpp | 0 .../profiler}/src/profile_gemm_streamk.cpp | 0 .../profiler}/src/profile_gemv_splitk.cpp | 0 .../src/profile_grouped_conv_bwd_data.cpp | 0 .../src/profile_grouped_conv_bwd_weight.cpp | 0 .../src/profile_grouped_conv_fwd.cpp | 0 .../profiler}/src/profile_grouped_gemm.cpp | 0 .../src/profile_grouped_gemm_fastgelu.cpp | 0 .../profiler}/src/profile_groupnorm.cpp | 0 .../profiler}/src/profile_layernorm.cpp | 0 .../profiler}/src/profile_max_pool3d_bwd.cpp | 0 .../profiler}/src/profile_max_pool3d_fwd.cpp | 0 .../profiler}/src/profile_reduce.cpp | 0 .../profiler}/src/profile_softmax.cpp | 0 .../profile_tall_and_skinny_gemm_splitk.cpp | 160 ++++ 
.../profiler}/src/profiler.cpp | 0 .../src/profiler_operation_registry.hpp | 0 rbuild.ini => composable_kernel/rbuild.ini | 0 .../requirements.txt | 0 .../script}/check_copyright_year.sh | 0 .../script}/clang-format-overwrite.sh | 0 .../script}/cmake-ck-dev.sh | 0 .../script}/cmake-ck-release.sh | 0 .../script}/count_vgpr.sh | 0 .../script}/hipclang_opt.sh | 0 .../script}/install_precommit.sh | 0 .../script}/parse_perf_data.py | 0 .../script}/process_perf_data.py | 0 .../script}/process_perf_data.sh | 0 .../script}/process_qa_data.sh | 0 .../script}/profile_batched_gemm.sh | 0 .../script}/profile_conv_bwd_data.sh | 0 .../script}/profile_conv_fwd.sh | 0 .../script}/profile_gemm.sh | 0 .../script}/profile_gemm_bilinear.sh | 0 .../script}/profile_grouped_gemm.sh | 0 .../script}/profile_onnx_gemm.sh | 0 .../script}/profile_reduce_no_index.sh | 0 .../script}/profile_reduce_with_index.sh | 0 .../script}/profile_resnet50.sh | 0 .../script}/profile_splitK_gemm.sh | 0 .../script}/run_full_performance_tests.sh | 0 .../script}/run_performance_tests.sh | 0 .../script}/test_convnd_fwd.sh | 0 .../script}/test_reduce_no_index.sh | 0 .../script}/test_reduce_with_index.sh | 0 .../script}/uninstall_precommit.sh | 0 .../test}/CMakeLists.txt | 0 .../test}/batched_gemm/CMakeLists.txt | 0 .../test}/batched_gemm/batched_gemm_bf16.cpp | 0 .../test}/batched_gemm/batched_gemm_fp16.cpp | 0 .../test}/batched_gemm/batched_gemm_fp32.cpp | 0 .../test}/batched_gemm/batched_gemm_int8.cpp | 0 .../test}/batched_gemm/test_batched_gemm.cpp | 0 .../test}/batched_gemm_gemm/CMakeLists.txt | 0 .../test_batched_gemm_gemm_fp16.cpp | 0 .../test_batched_gemm_gemm_util.hpp | 0 .../test}/batched_gemm_multi_d/CMakeLists.txt | 0 .../test_batched_gemm_multi_d_dl.cpp | 0 .../test}/batched_gemm_reduce/CMakeLists.txt | 0 .../batched_gemm_reduce_fp16.cpp | 0 .../batched_gemm_softmax_gemm/CMakeLists.txt | 0 .../test_batched_gemm_softmax_gemm_fp16.cpp | 0 .../test_batched_gemm_softmax_gemm_util.hpp | 0 
.../CMakeLists.txt | 0 ...ed_gemm_bias_softmax_gemm_permute_bf16.cpp | 0 ...ed_gemm_bias_softmax_gemm_permute_fp16.cpp | 0 ...ed_gemm_bias_softmax_gemm_permute_util.hpp | 0 ...batched_gemm_softmax_gemm_permute_bf16.cpp | 0 ...batched_gemm_softmax_gemm_permute_fp16.cpp | 0 ...batched_gemm_softmax_gemm_permute_util.hpp | 0 .../test}/batchnorm/CMakeLists.txt | 0 .../test}/batchnorm/batchnorm_bwd_rank_4.cpp | 0 .../test}/batchnorm/batchnorm_fwd_rank_4.cpp | 0 .../batchnorm/batchnorm_infer_rank_4.cpp | 0 .../block_swizzle_test/block_swizzle_test.cpp | 0 .../test}/block_swizzle_test/rebuild.sh | 0 .../test}/block_swizzle_test/simple_args.h | 0 .../test}/block_to_ctile_map/CMakeLists.txt | 0 .../test_block_to_ctile_map.cpp | 0 .../test}/contraction/CMakeLists.txt | 0 .../test}/contraction/test_contraction.cpp | 0 .../test_contraction_interface.cpp | 0 .../conv_tensor_rearrange/CMakeLists.txt | 0 .../test_conv_tensor_rearrange.cpp | 0 .../test_conv_tensor_rearrange_interface.cpp | 0 .../test}/conv_util/CMakeLists.txt | 0 .../test}/conv_util/conv_util.cpp | 0 .../test}/convnd_bwd_data/CMakeLists.txt | 0 .../test}/convnd_bwd_data/convnd_bwd_data.cpp | 0 .../test}/convnd_fwd/CMakeLists.txt | 0 .../test}/convnd_fwd/convnd_fwd.cpp | 0 .../test}/data_type/CMakeLists.txt | 0 .../test}/data_type/test_bf8.cpp | 0 .../test}/data_type/test_fp8.cpp | 0 .../test}/data_type/test_int4.cpp | 0 .../test}/data_type/type_convert_const.cpp | 0 .../elementwise_normalization/CMakeLists.txt | 0 .../test_elementwise_layernorm_fp16.cpp | 0 .../test}/gemm/CMakeLists.txt | 0 .../test}/gemm/gemm_bf16.cpp | 0 .../test}/gemm/gemm_fp16.cpp | 0 .../test}/gemm/gemm_fp32.cpp | 0 .../test}/gemm/gemm_fp64.cpp | 0 .../test}/gemm/gemm_int8.cpp | 0 .../test}/gemm/gemm_standalone_xdl_fp16.cpp | 0 .../test}/gemm/gemm_util.hpp | 0 .../gemm/instance/gemm_f16_nn_instance.cpp | 0 .../gemm/instance/gemm_f16_nn_instance.hpp | 0 .../gemm/instance/gemm_f16_nt_instance.cpp | 0 .../gemm/instance/gemm_f16_nt_instance.hpp | 
0 .../gemm/instance/gemm_f16_tn_instance.cpp | 0 .../gemm/instance/gemm_f16_tn_instance.hpp | 0 .../gemm/instance/gemm_f16_tt_instance.cpp | 0 .../gemm/instance/gemm_f16_tt_instance.hpp | 0 .../instance/gemm_wavelet_f16_tn_instance.cpp | 0 .../instance/gemm_wavelet_f16_tn_instance.hpp | 0 .../test}/gemm/run_gemm_test.inc | 0 .../test}/gemm_layernorm/CMakeLists.txt | 0 .../test_gemm_add_relu_add_layernorm_fp16.cpp | 0 .../test}/gemm_reduce/CMakeLists.txt | 0 .../test}/gemm_reduce/gemm_reduce_fp16.cpp | 0 .../test}/gemm_split_k/CMakeLists.txt | 0 .../test}/gemm_split_k/test_gemm_splitk.cpp | 0 .../test_gemm_splitk_ut_cases.inc | 0 .../gemm_split_k/test_gemm_splitk_util.hpp | 0 .../grouped_convnd_bwd_data/CMakeLists.txt | 0 .../test_grouped_convnd_bwd_data.cpp | 0 ...grouped_convnd_bwd_data_interface_wmma.cpp | 0 ..._grouped_convnd_bwd_data_interface_xdl.cpp | 0 .../grouped_convnd_bwd_weight/CMakeLists.txt | 0 .../test_grouped_convnd_bwd_weight.cpp | 0 ...ouped_convnd_bwd_weight_interface_wmma.cpp | 0 ...rouped_convnd_bwd_weight_interface_xdl.cpp | 0 .../test}/grouped_convnd_fwd/CMakeLists.txt | 0 .../grouped_convnd_fwd/grouped_convnd_fwd.cpp | 0 .../test_grouped_convnd_fwd.cpp | 0 .../test}/grouped_gemm/CMakeLists.txt | 0 .../test_grouped_gemm_interface.cpp | 0 .../grouped_gemm/test_grouped_gemm_splitk.cpp | 0 .../test_grouped_gemm_ut_cases.inc | 0 .../grouped_gemm/test_grouped_gemm_util.hpp | 0 .../test}/image_to_column/CMakeLists.txt | 0 .../image_to_column/test_image_to_column.cpp | 0 .../test_image_to_column_interface.cpp | 0 .../magic_number_division/CMakeLists.txt | 0 .../magic_number_division.cpp | 0 .../test}/normalization/CMakeLists.txt | 0 .../normalization/test_groupnorm_fp16.cpp | 0 .../normalization/test_groupnorm_fp32.cpp | 0 .../normalization/test_layernorm2d_fp16.cpp | 0 .../normalization/test_layernorm2d_fp32.cpp | 0 .../test}/pool/CMakeLists.txt | 0 .../test}/pool/test_avg_pool3d_bwd.cpp | 0 .../test}/pool/test_avg_pool3d_fwd.cpp | 0 
.../test}/pool/test_max_pool3d_bwd.cpp | 0 .../test}/pool/test_max_pool3d_fwd.cpp | 0 .../test}/pool/test_pool_fwd_common.hpp | 0 .../test}/reduce/CMakeLists.txt | 0 .../test}/reduce/reduce_no_index.cpp | 0 .../test}/reduce/reduce_with_index.cpp | 0 .../test}/reference_conv_fwd/CMakeLists.txt | 0 .../reference_conv_fwd/reference_conv_fwd.cpp | 0 .../test}/softmax/CMakeLists.txt | 0 .../test}/softmax/test_softmax_interface.cpp | 0 .../test}/softmax/test_softmax_rank3.cpp | 0 .../test}/softmax/test_softmax_rank4.cpp | 0 .../test}/softmax/test_softmax_ut_cases.inc | 0 .../test}/softmax/test_softmax_util.hpp | 0 .../test}/space_filling_curve/CMakeLists.txt | 0 .../space_filling_curve.cpp | 0 .../test}/wmma_op/CMakeLists.txt | 0 .../test}/wmma_op/wmma_op.cpp | 0 .../test}/wmma_op/wmma_op_util.hpp | 0 .../gpu/gemv_splitk/CMakeLists.txt | 10 - 1829 files changed, 2300 insertions(+), 197 deletions(-) rename CHANGELOG.md => composable_kernel/CHANGELOG.md (100%) rename CITATION.cff => composable_kernel/CITATION.cff (100%) rename CMakeLists.txt => composable_kernel/CMakeLists.txt (100%) rename CONTRIBUTORS.md => composable_kernel/CONTRIBUTORS.md (100%) rename Config.cmake.in => composable_kernel/Config.cmake.in (100%) rename Dockerfile => composable_kernel/Dockerfile (100%) rename Jenkinsfile => composable_kernel/Jenkinsfile (100%) rename LICENSE => composable_kernel/LICENSE (100%) rename README.md => composable_kernel/README.md (100%) rename {client_example => composable_kernel/client_example}/01_gemm/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/01_gemm/gemm.cpp (100%) rename {client_example => composable_kernel/client_example}/02_gemm_add_add_fastgelu/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp (100%) rename {client_example => composable_kernel/client_example}/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp (100%) rename {client_example => 
composable_kernel/client_example}/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp (100%) rename {client_example => composable_kernel/client_example}/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp (100%) rename {client_example => composable_kernel/client_example}/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp (100%) rename {client_example => composable_kernel/client_example}/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp (100%) rename {client_example => composable_kernel/client_example}/03_gemm_layernorm/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp (100%) rename {client_example => composable_kernel/client_example}/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp (100%) rename {client_example => composable_kernel/client_example}/04_contraction/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/04_contraction/contraction_bilinear_fp32.cpp (100%) rename {client_example => composable_kernel/client_example}/04_contraction/contraction_bilinear_fp64.cpp (100%) rename {client_example => composable_kernel/client_example}/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/04_contraction/contraction_scale_fp32.cpp (100%) rename {client_example => composable_kernel/client_example}/04_contraction/contraction_scale_fp64.cpp (100%) rename {client_example => composable_kernel/client_example}/05_layernorm/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/05_layernorm/layernorm2d.cpp (100%) rename {client_example => composable_kernel/client_example}/06_softmax/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/06_softmax/softmax4d.cpp (100%) rename {client_example => composable_kernel/client_example}/07_grouped_convnd_fwd/CMakeLists.txt (100%) rename {client_example => 
composable_kernel/client_example}/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp (100%) rename {client_example => composable_kernel/client_example}/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp (100%) rename {client_example => composable_kernel/client_example}/08_fused_attention/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/08_fused_attention/fused_attention.cpp (100%) rename {client_example => composable_kernel/client_example}/08_fused_attention/fused_attention_bias.cpp (100%) rename {client_example => composable_kernel/client_example}/09_quantization/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp (100%) rename {client_example => composable_kernel/client_example}/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp (100%) rename {client_example => composable_kernel/client_example}/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp (100%) rename {client_example => composable_kernel/client_example}/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp (100%) rename {client_example => composable_kernel/client_example}/09_quantization/conv2d_fwd_perchannel_quantization.cpp (100%) rename {client_example => composable_kernel/client_example}/09_quantization/conv2d_fwd_perlayer_quantization.cpp (100%) rename {client_example => composable_kernel/client_example}/09_quantization/gemm_quantization.cpp (100%) rename {client_example => composable_kernel/client_example}/10_grouped_convnd_bwd_data/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data.cpp (100%) rename {client_example => composable_kernel/client_example}/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data.cpp (100%) rename {client_example => composable_kernel/client_example}/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp (100%) rename 
{client_example => composable_kernel/client_example}/11_grouped_conv_bwd_weight/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/11_grouped_conv_bwd_weight/common.hpp (100%) rename {client_example => composable_kernel/client_example}/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp (100%) rename {client_example => composable_kernel/client_example}/12_elementwise_normalization/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/12_elementwise_normalization/elementwise_layernorm2d.cpp (100%) rename {client_example => composable_kernel/client_example}/13_batchnorm/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/13_batchnorm/batchnorm_bwd_nhwc.cpp (100%) rename {client_example => composable_kernel/client_example}/13_batchnorm/batchnorm_fwd_nhwc.cpp (100%) rename {client_example => composable_kernel/client_example}/13_batchnorm/batchnorm_infer_nhwc.cpp (100%) rename {client_example => composable_kernel/client_example}/14_instance_id/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/14_instance_id/batchnorm_fwd_instance_id.cpp (100%) rename {client_example => composable_kernel/client_example}/15_convnd_bwd_data/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/15_convnd_bwd_data/common.hpp (100%) rename {client_example => composable_kernel/client_example}/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp (100%) rename 
{client_example => composable_kernel/client_example}/15_gemm_add_multiply/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/15_gemm_add_multiply/gemm_add_multiply.cpp (100%) rename {client_example => composable_kernel/client_example}/15_reduce/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/15_reduce/reduce_nhwc_c.cpp (100%) rename {client_example => composable_kernel/client_example}/16_convnd_fwd/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/16_convnd_fwd/common.hpp (100%) rename {client_example => composable_kernel/client_example}/16_convnd_fwd/conv3d_fwd_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/16_convnd_fwd/conv3d_fwd_fp16_comp_fp8.cpp (100%) rename {client_example => composable_kernel/client_example}/16_convnd_fwd/conv3d_fwd_fp32.cpp (100%) rename {client_example => composable_kernel/client_example}/17_grouped_gemm_fastgelu/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp (100%) rename {client_example => composable_kernel/client_example}/18_groupnorm/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/18_groupnorm/groupnorm_swish.cpp (100%) rename {client_example => composable_kernel/client_example}/19_pool/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/19_pool/avg_pool3d_bwd.cpp (100%) rename {client_example => composable_kernel/client_example}/19_pool/avg_pool3d_fwd.cpp (100%) rename {client_example => composable_kernel/client_example}/19_pool/max_pool2d_bwd.cpp (100%) rename {client_example => composable_kernel/client_example}/19_pool/max_pool2d_fwd.cpp (100%) rename {client_example => composable_kernel/client_example}/20_splitk_gemm/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/20_splitk_gemm/splitK_gemm_fp16_f8.cpp (100%) rename 
{client_example => composable_kernel/client_example}/21_grouped_gemm_bias/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/22_grouped_gemm/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/22_grouped_gemm/grouped_gemm_fixed_nk_fp16.cpp (100%) rename {client_example => composable_kernel/client_example}/22_grouped_gemm/grouped_gemm_fixed_nk_fp8.cpp (100%) rename {client_example => composable_kernel/client_example}/22_grouped_gemm/grouped_gemm_fixed_nk_i8.cpp (100%) rename {client_example => composable_kernel/client_example}/22_im2col_col2im/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/22_im2col_col2im/column_to_image.cpp (100%) rename {client_example => composable_kernel/client_example}/22_im2col_col2im/image_to_column.cpp (100%) rename {client_example => composable_kernel/client_example}/CMakeLists.txt (100%) rename {client_example => composable_kernel/client_example}/README.md (100%) rename {cmake => composable_kernel/cmake}/Analyzers.cmake (100%) rename {cmake => composable_kernel/cmake}/ClangTidy.cmake (100%) rename {cmake => composable_kernel/cmake}/CppCheck.cmake (100%) rename {cmake => composable_kernel/cmake}/DoxygenDoc.cmake (100%) rename {cmake => composable_kernel/cmake}/EnableCompilerWarnings.cmake (100%) rename {cmake => composable_kernel/cmake}/TargetFlags.cmake (100%) rename {cmake => composable_kernel/cmake}/googletest.cmake (100%) rename dev-requirements.txt => composable_kernel/dev-requirements.txt (100%) rename {docs => composable_kernel/docs}/API_Reference_Guide.rst (100%) rename {docs => composable_kernel/docs}/Contributors_Guide.rst (100%) rename {docs => composable_kernel/docs}/Supported_Primitives_Guide.rst (100%) rename {docs => composable_kernel/docs}/conf.py (100%) rename {docs => 
composable_kernel/docs}/data/ck_component.png (100%) rename {docs => composable_kernel/docs}/data/ck_layer.png (100%) rename {docs => composable_kernel/docs}/dockerhub.rst (100%) rename {docs => composable_kernel/docs}/doxygen/Doxyfile (100%) rename {docs => composable_kernel/docs}/index.rst (100%) rename {docs => composable_kernel/docs}/license.rst (100%) rename {docs => composable_kernel/docs}/refs.bib (100%) rename {docs => composable_kernel/docs}/sphinx/_toc.yml.in (100%) rename {docs => composable_kernel/docs}/sphinx/requirements.in (100%) rename {docs => composable_kernel/docs}/sphinx/requirements.txt (100%) rename {docs => composable_kernel/docs}/tutorial_hello_world.rst (100%) rename {example => composable_kernel/example}/01_gemm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/01_gemm/README.md (100%) rename {example => composable_kernel/example}/01_gemm/common.hpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_dl_fp16.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_dl_fp32.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_dl_int4.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_dl_int8.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_dpp_fp16.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_wmma_fp16.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_bf16_rtn.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_fp16_fp8.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_fp64.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_fp8.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_fp8_bf8.cpp (100%) rename {example => 
composable_kernel/example}/01_gemm/gemm_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_streamk.cpp (100%) rename {example => composable_kernel/example}/01_gemm/gemm_xdl_wavelet_fp16.cpp (100%) rename {example => composable_kernel/example}/01_gemm/run_gemm_example.inc (100%) rename {example => composable_kernel/example}/02_gemm_bilinear/CMakeLists.txt (100%) rename {example => composable_kernel/example}/02_gemm_bilinear/README.md (100%) rename {example => composable_kernel/example}/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp (100%) rename {example => composable_kernel/example}/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp (100%) rename {example => composable_kernel/example}/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/03_gemm_bias_relu/CMakeLists.txt (100%) rename {example => composable_kernel/example}/03_gemm_bias_relu/README.md (100%) rename {example => composable_kernel/example}/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/CMakeLists.txt (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/README.md (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/common.hpp (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp (100%) rename {example => 
composable_kernel/example}/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc (100%) rename {example => composable_kernel/example}/09_convnd_fwd/CMakeLists.txt (100%) rename {example => composable_kernel/example}/09_convnd_fwd/README.md (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_common.hpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_dl_common.hpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_dl_fp16.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_dl_fp32.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_dl_int8.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/convnd_fwd_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/09_convnd_fwd/run_convnd_fwd_dl_example.inc (100%) rename {example => composable_kernel/example}/09_convnd_fwd/run_convnd_fwd_example.inc (100%) rename {example => composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt (100%) rename {example => composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp (100%) rename {example => composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp (100%) rename {example => 
composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc (100%) rename {example => composable_kernel/example}/12_reduce/CMakeLists.txt (100%) rename {example => composable_kernel/example}/12_reduce/README.md (100%) rename {example => composable_kernel/example}/12_reduce/reduce_blockwise.cpp (100%) rename {example => composable_kernel/example}/12_reduce/reduce_blockwise_impl.hpp (100%) rename {example => composable_kernel/example}/12_reduce/reduce_blockwise_two_call.cpp (100%) rename {example => composable_kernel/example}/12_reduce/reduce_example_common.hpp (100%) rename {example => composable_kernel/example}/12_reduce/reduce_multiblock_atomic_add.cpp (100%) rename {example => composable_kernel/example}/12_reduce/reduce_multiblock_atomic_add_impl.hpp (100%) rename {example => composable_kernel/example}/13_pool2d_fwd/CMakeLists.txt (100%) rename {example => composable_kernel/example}/13_pool2d_fwd/README.md (100%) rename {example => composable_kernel/example}/13_pool2d_fwd/pool2d_fwd_common.hpp (100%) rename {example => composable_kernel/example}/13_pool2d_fwd/pool2d_fwd_fp16.cpp (100%) rename {example => composable_kernel/example}/13_pool2d_fwd/pool2d_fwd_fp32.cpp (100%) rename {example => composable_kernel/example}/14_gemm_quantization/CMakeLists.txt (100%) rename {example => composable_kernel/example}/14_gemm_quantization/gemm_dl_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp (100%) rename {example => 
composable_kernel/example}/14_gemm_quantization/gemm_xdl_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/15_grouped_gemm/README.md (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp8.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp (100%) rename {example => composable_kernel/example}/15_grouped_gemm/run_grouped_gemm_example.inc (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/CMakeLists.txt (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp (100%) 
rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp (100%) rename {example => composable_kernel/example}/17_convnd_bwd_data/CMakeLists.txt (100%) rename {example => composable_kernel/example}/17_convnd_bwd_data/README.md (100%) rename {example => composable_kernel/example}/17_convnd_bwd_data/convnd_bwd_data_common.hpp (100%) rename {example => composable_kernel/example}/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp (100%) rename {example => composable_kernel/example}/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/18_batched_gemm_reduce/CMakeLists.txt (100%) rename {example => composable_kernel/example}/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/19_binary_elementwise/CMakeLists.txt (100%) rename {example => composable_kernel/example}/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp (100%) rename {example => composable_kernel/example}/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp (100%) rename {example => composable_kernel/example}/19_binary_elementwise/elementwise_add_1d.cpp (100%) rename {example => composable_kernel/example}/19_binary_elementwise/elementwise_add_4d.cpp (100%) rename {example => composable_kernel/example}/20_grouped_conv_bwd_weight/CMakeLists.txt (100%) rename {example => 
composable_kernel/example}/20_grouped_conv_bwd_weight/common.hpp (100%) rename {example => composable_kernel/example}/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp (100%) rename {example => composable_kernel/example}/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_wmma_fp16.cpp (100%) rename {example => composable_kernel/example}/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp (100%) rename {example => composable_kernel/example}/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc (100%) rename {example => composable_kernel/example}/21_gemm_layernorm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp (100%) rename {example => composable_kernel/example}/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp (100%) rename {example => composable_kernel/example}/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp (100%) rename {example => composable_kernel/example}/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp (100%) rename {example => composable_kernel/example}/22_cgemm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/22_cgemm/cgemm_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/22_cgemm/cgemm_xdl_common.hpp (100%) rename {example => composable_kernel/example}/22_cgemm/cgemm_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/22_cgemm/cgemm_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/22_cgemm/cgemm_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/22_cgemm/cgemm_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/23_softmax/CMakeLists.txt (100%) rename {example 
=> composable_kernel/example}/23_softmax/README.md (100%) rename {example => composable_kernel/example}/23_softmax/softmax_blockwise.cpp (100%) rename {example => composable_kernel/example}/24_batched_gemm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/24_batched_gemm/batched_gemm_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/24_batched_gemm/batched_gemm_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/24_batched_gemm/batched_gemm_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/24_batched_gemm/batched_gemm_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/24_batched_gemm/batched_gemm_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/24_batched_gemm/run_batched_gemm_example.inc (100%) rename {example => composable_kernel/example}/25_gemm_bias_e_permute/CMakeLists.txt (100%) rename {example => composable_kernel/example}/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/26_contraction/CMakeLists.txt (100%) rename {example => composable_kernel/example}/26_contraction/README.md (100%) rename {example => composable_kernel/example}/26_contraction/contraction_bilinear_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/26_contraction/contraction_bilinear_xdl_fp64.cpp (100%) rename {example => composable_kernel/example}/26_contraction/contraction_scale_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/26_contraction/contraction_scale_xdl_fp64.cpp (100%) rename {example => composable_kernel/example}/27_layernorm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/27_layernorm/common.hpp (100%) rename {example => composable_kernel/example}/27_layernorm/layernorm_fp16.cpp (100%) rename {example => 
composable_kernel/example}/27_layernorm/layernorm_splitk_fp16.cpp (100%) rename {example => composable_kernel/example}/27_layernorm/run_layernorm_example.inc (100%) rename {example => composable_kernel/example}/28_grouped_gemm_bias_e_permute/CMakeLists.txt (100%) rename {example => composable_kernel/example}/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/29_batched_gemm_bias_e_permute/CMakeLists.txt (100%) rename {example => composable_kernel/example}/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp (100%) rename {example => composable_kernel/example}/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/CMakeLists.txt (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/README.md (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/common.hpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/common_wmma.hpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp (100%) rename {example => 
composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc (100%) rename {example => composable_kernel/example}/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc (100%) rename {example => composable_kernel/example}/31_batched_gemm_gemm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp 
(100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc (100%) rename {example => composable_kernel/example}/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc (100%) rename {example => composable_kernel/example}/33_multiple_reduce/CMakeLists.txt (100%) rename {example => composable_kernel/example}/33_multiple_reduce/README.md (100%) rename {example => composable_kernel/example}/33_multiple_reduce/dual_reduce_common.hpp (100%) rename {example => composable_kernel/example}/33_multiple_reduce/dual_reduce_multiblock.cpp (100%) rename {example => composable_kernel/example}/33_multiple_reduce/dual_reduce_threadwise.cpp (100%) rename {example => composable_kernel/example}/34_batchnorm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/34_batchnorm/README.md (100%) rename {example => composable_kernel/example}/34_batchnorm/batchnorm_backward_nhwc.cpp (100%) rename {example => composable_kernel/example}/34_batchnorm/batchnorm_common.hpp (100%) rename {example => composable_kernel/example}/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp (100%) rename {example => composable_kernel/example}/34_batchnorm/batchnorm_forward_training_nhwc.cpp (100%) rename {example => composable_kernel/example}/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp (100%) rename {example => 
composable_kernel/example}/34_batchnorm/batchnorm_infer_impl.hpp (100%) rename {example => composable_kernel/example}/35_splitK_gemm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/35_splitK_gemm/run_splitK_gemm_example.inc (100%) rename {example => composable_kernel/example}/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/35_splitK_gemm/splitK_gemm_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/35_splitK_gemm/splitK_gemm_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/36_sparse_embedding/CMakeLists.txt (100%) rename {example => composable_kernel/example}/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp (100%) rename {example => composable_kernel/example}/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt (100%) rename {example => composable_kernel/example}/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt (100%) rename {example => composable_kernel/example}/38_grouped_conv_bwd_data_multiple_d/common.hpp (100%) rename {example => composable_kernel/example}/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_wmma_fp16.cpp (100%) rename {example => composable_kernel/example}/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc (100%) rename {example => 
composable_kernel/example}/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc (100%) rename {example => composable_kernel/example}/39_permute/CMakeLists.txt (100%) rename {example => composable_kernel/example}/39_permute/common.hpp (100%) rename {example => composable_kernel/example}/39_permute/permute_1xHxW_fp16.cpp (100%) rename {example => composable_kernel/example}/39_permute/permute_HxWx4_fp16.cpp (100%) rename {example => composable_kernel/example}/39_permute/permute_NxHxW_fp16.cpp (100%) rename {example => composable_kernel/example}/39_permute/run_permute_bundle_example.inc (100%) rename {example => composable_kernel/example}/39_permute/run_permute_element_example.inc (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/CMakeLists.txt (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/common.hpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp (100%) rename {example => 
composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc (100%) rename {example => composable_kernel/example}/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc (100%) rename {example => composable_kernel/example}/41_grouped_conv_conv_fwd/CMakeLists.txt (100%) rename {example => composable_kernel/example}/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp (100%) rename {example => composable_kernel/example}/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp (100%) rename {example => composable_kernel/example}/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp (100%) rename {example => composable_kernel/example}/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc (100%) rename {example => composable_kernel/example}/42_groupnorm/CMakeLists.txt (100%) rename {example => composable_kernel/example}/42_groupnorm/common.hpp (100%) rename {example => composable_kernel/example}/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp (100%) rename {example => composable_kernel/example}/42_groupnorm/groupnorm_splitk_fp16.cpp (100%) rename {example => composable_kernel/example}/42_groupnorm/groupnorm_swish_fp16.cpp (100%) rename {example => 
composable_kernel/example}/42_groupnorm/run_groupnorm_example.inc (100%) rename {example => composable_kernel/example}/43_splitk_gemm_bias_e_permute/CMakeLists.txt (100%) rename {example => composable_kernel/example}/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp (100%) rename {example => composable_kernel/example}/44_elementwise_permute/CMakeLists.txt (100%) rename {example => composable_kernel/example}/44_elementwise_permute/elementwise_permute_4D_fp16.cpp (100%) rename {example => composable_kernel/example}/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp (100%) rename {example => composable_kernel/example}/45_elementwise_normalization/CMakeLists.txt (100%) rename {example => composable_kernel/example}/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp (100%) rename {example => composable_kernel/example}/46_gemm_add_multiply/CMakeLists.txt (100%) rename {example => composable_kernel/example}/46_gemm_add_multiply/README.md (100%) rename {example => composable_kernel/example}/46_gemm_add_multiply/common.hpp (100%) rename {example => composable_kernel/example}/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp (100%) rename {example => composable_kernel/example}/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/46_gemm_add_multiply/run_gemm_add_multiply_example.inc (100%) rename {example => composable_kernel/example}/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt (100%) rename {example => composable_kernel/example}/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp (100%) rename {example => composable_kernel/example}/48_pool3d_fwd/CMakeLists.txt (100%) rename {example => composable_kernel/example}/48_pool3d_fwd/pool3d_fwd_common.hpp (100%) rename {example => composable_kernel/example}/48_pool3d_fwd/pool3d_fwd_fp16.cpp (100%) 
rename {example => composable_kernel/example}/49_maxpool2d_bwd/CMakeLists.txt (100%) rename {example => composable_kernel/example}/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp (100%) rename {example => composable_kernel/example}/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp (100%) rename {example => composable_kernel/example}/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp (100%) rename {example => composable_kernel/example}/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp (100%) rename {example => composable_kernel/example}/50_put_element/CMakeLists.txt (100%) rename {example => composable_kernel/example}/50_put_element/put_element_fp16.cpp (100%) rename {example => composable_kernel/example}/51_avgpool3d_bwd/CMakeLists.txt (100%) rename {example => composable_kernel/example}/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp (100%) rename {example => composable_kernel/example}/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp (100%) rename {example => composable_kernel/example}/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp (100%) rename {example => composable_kernel/example}/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp (100%) rename {example => composable_kernel/example}/52_im2col_col2im/CMakeLists.txt (100%) rename {example => composable_kernel/example}/52_im2col_col2im/column_to_image_f32.cpp (100%) rename {example => composable_kernel/example}/52_im2col_col2im/common.hpp (100%) rename {example => composable_kernel/example}/52_im2col_col2im/image_to_column_f32.cpp (100%) rename {example => composable_kernel/example}/53_gemv_splitk/CMakeLists.txt (100%) rename {example => composable_kernel/example}/53_gemv_splitk/README.md (100%) rename {example => composable_kernel/example}/53_gemv_splitk/common.hpp (100%) rename {example => composable_kernel/example}/53_gemv_splitk/gemv_splitk_fp16.cpp (95%) rename {example => composable_kernel/example}/53_gemv_splitk/run_gemv_splitk_example.inc (100%) create mode 100755 composable_kernel/example/54_tall_and_skinny_gemm_splitk/CMakeLists.txt create mode 100755 
composable_kernel/example/54_tall_and_skinny_gemm_splitk/README.md create mode 100755 composable_kernel/example/54_tall_and_skinny_gemm_splitk/common.hpp create mode 100755 composable_kernel/example/54_tall_and_skinny_gemm_splitk/run_tall_and_skinny_gemm_splitk_example.inc create mode 100755 composable_kernel/example/54_tall_and_skinny_gemm_splitk/tall_and_skinny_gemm_splitk_fp16.cpp rename {example => composable_kernel/example}/60_gemm_multi_ABD/CMakeLists.txt (100%) rename {example => composable_kernel/example}/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/61_contraction_multi_ABD/CMakeLists.txt (100%) rename {example => composable_kernel/example}/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/CMakeLists.txt (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_activ_common.hpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_abs_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_clippedrelu_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_elu_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_leakyrelu_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_pow_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp (100%) rename {example => composable_kernel/example}/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc (100%) rename {example 
=> composable_kernel/example}/CMakeLists.txt (100%) rename {include => composable_kernel/include}/ck/ck.hpp (100%) rename {include => composable_kernel/include}/ck/config.h.in (100%) rename {include => composable_kernel/include}/ck/host_utility/device_prop.hpp (100%) rename {include => composable_kernel/include}/ck/host_utility/hip_check_error.hpp (100%) rename {include => composable_kernel/include}/ck/host_utility/io.hpp (100%) rename {include => composable_kernel/include}/ck/host_utility/kernel_launch.hpp (100%) rename {include => composable_kernel/include}/ck/host_utility/stream_utility.hpp (100%) rename {include => composable_kernel/include}/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp (100%) rename {include => composable_kernel/include}/ck/stream_config.hpp (100%) rename {include => composable_kernel/include}/ck/tensor/static_tensor.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_description/cluster_descriptor.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_description/multi_index_transform.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_description/multi_index_transform_helper.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_description/tensor_adaptor.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_description/tensor_descriptor.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_description/tensor_descriptor_helper.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_description/tensor_space_filling_curve.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_softmax.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/blockwise_welford.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp 
(100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_base.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_cgemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp (100%) rename 
{include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_conv_fwd.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_elementwise.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_normalization.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_permute.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_pool_fwd.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_put_element.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_reduce.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/device_softmax.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp (100%) rename include/ck/tensor_operation/gpu/device/device_gemv.hpp => composable_kernel/include/ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp (97%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/gemm_specialization.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp (100%) rename 
{include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp => composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp (89%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/masking_specialization.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/matrix_padder.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/tensor_layout.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/tensor_specialization.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/device/welford_helper.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/element/element_wise_operation.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/element/quantization_operation.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_batchnorm_forward.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final_obsolete.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_welford_second_half_layernorm2d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_dpp.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_gemv_splitk.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_permute.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp (100%) create mode 100755 composable_kernel/include/ck/tensor_operation/gpu/grid/gridwise_tall_and_skinny_gemm_splitk.hpp rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp (100%) rename {include => 
composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/thread/threadwise_welford.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/warp/dpp_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/warp/wmma_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp (100%) rename {include => composable_kernel/include}/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp (100%) rename {include => composable_kernel/include}/ck/utility/amd_address_space.hpp (100%) rename {include => composable_kernel/include}/ck/utility/amd_buffer_addressing.hpp (100%) rename {include => 
composable_kernel/include}/ck/utility/amd_gemm_dpp.hpp (100%) rename {include => composable_kernel/include}/ck/utility/amd_inline_asm.hpp (100%) rename {include => composable_kernel/include}/ck/utility/amd_wave_read_first_lane.hpp (100%) rename {include => composable_kernel/include}/ck/utility/amd_wmma.hpp (100%) rename {include => composable_kernel/include}/ck/utility/amd_xdlops.hpp (100%) rename {include => composable_kernel/include}/ck/utility/array.hpp (100%) rename {include => composable_kernel/include}/ck/utility/array_multi_index.hpp (100%) rename {include => composable_kernel/include}/ck/utility/c_style_pointer_cast.hpp (100%) rename {include => composable_kernel/include}/ck/utility/common_header.hpp (100%) rename {include => composable_kernel/include}/ck/utility/container_element_picker.hpp (100%) rename {include => composable_kernel/include}/ck/utility/container_helper.hpp (100%) rename {include => composable_kernel/include}/ck/utility/data_type.hpp (100%) rename {include => composable_kernel/include}/ck/utility/debug.hpp (100%) rename {include => composable_kernel/include}/ck/utility/dynamic_buffer.hpp (100%) rename {include => composable_kernel/include}/ck/utility/enable_if.hpp (100%) rename {include => composable_kernel/include}/ck/utility/f8_utils.hpp (100%) rename {include => composable_kernel/include}/ck/utility/functional.hpp (100%) rename {include => composable_kernel/include}/ck/utility/functional2.hpp (100%) rename {include => composable_kernel/include}/ck/utility/functional3.hpp (100%) rename {include => composable_kernel/include}/ck/utility/functional4.hpp (100%) rename {include => composable_kernel/include}/ck/utility/generic_memory_space_atomic.hpp (100%) rename {include => composable_kernel/include}/ck/utility/get_id.hpp (100%) rename {include => composable_kernel/include}/ck/utility/get_shift.hpp (100%) rename {include => composable_kernel/include}/ck/utility/ignore.hpp (100%) rename {include => 
composable_kernel/include}/ck/utility/inner_product.hpp (100%) rename {include => composable_kernel/include}/ck/utility/inner_product_dpp8.hpp (100%) rename {include => composable_kernel/include}/ck/utility/integral_constant.hpp (100%) rename {include => composable_kernel/include}/ck/utility/is_detected.hpp (100%) rename {include => composable_kernel/include}/ck/utility/is_known_at_compile_time.hpp (100%) rename {include => composable_kernel/include}/ck/utility/loop_scheduler.hpp (100%) rename {include => composable_kernel/include}/ck/utility/magic_division.hpp (100%) rename {include => composable_kernel/include}/ck/utility/math.hpp (100%) rename {include => composable_kernel/include}/ck/utility/math_v2.hpp (100%) rename {include => composable_kernel/include}/ck/utility/multi_index.hpp (100%) rename {include => composable_kernel/include}/ck/utility/number.hpp (100%) rename {include => composable_kernel/include}/ck/utility/random_gen.hpp (100%) rename {include => composable_kernel/include}/ck/utility/reduction_common.hpp (100%) rename {include => composable_kernel/include}/ck/utility/reduction_enums.hpp (100%) rename {include => composable_kernel/include}/ck/utility/reduction_functions_accumulate.hpp (100%) rename {include => composable_kernel/include}/ck/utility/reduction_operator.hpp (100%) rename {include => composable_kernel/include}/ck/utility/sequence.hpp (100%) rename {include => composable_kernel/include}/ck/utility/sequence_helper.hpp (100%) rename {include => composable_kernel/include}/ck/utility/span.hpp (100%) rename {include => composable_kernel/include}/ck/utility/static_buffer.hpp (100%) rename {include => composable_kernel/include}/ck/utility/statically_indexed_array.hpp (100%) rename {include => composable_kernel/include}/ck/utility/statically_indexed_array_multi_index.hpp (100%) rename {include => composable_kernel/include}/ck/utility/synchronization.hpp (100%) rename {include => composable_kernel/include}/ck/utility/thread_group.hpp (100%) rename 
{include => composable_kernel/include}/ck/utility/transpose_vectors.hpp (100%) rename {include => composable_kernel/include}/ck/utility/tuple.hpp (100%) rename {include => composable_kernel/include}/ck/utility/tuple_helper.hpp (100%) rename {include => composable_kernel/include}/ck/utility/type.hpp (100%) rename {include => composable_kernel/include}/ck/utility/type_convert.hpp (100%) rename {include => composable_kernel/include}/ck/utility/workgroup_barrier.hpp (100%) rename {include => composable_kernel/include}/ck/utility/workgroup_synchronization.hpp (100%) rename {include => composable_kernel/include}/ck/version.h.in (100%) rename {library => composable_kernel/library}/CMakeLists.txt (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp (90%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv2d_fwd_wmma_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_dl_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_bias.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_fastgelu.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/normalization.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax.hpp (100%) 
rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp (100%) create mode 100755 composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk.hpp rename {library => composable_kernel/library}/include/ck/library/utility/algorithm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/check_err.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/conv_common.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/convolution_parameter.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/device_memory.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/fill.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/host_common_util.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/host_gemm.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/host_tensor.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/host_tensor_generator.hpp (100%) rename {library => 
composable_kernel/library}/include/ck/library/utility/iterator.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/literals.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/numeric.hpp (100%) rename {library => composable_kernel/library}/include/ck/library/utility/ranges.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/avg_pool3d_bwd/avg_pool3d_bwd_ndhwc_instance_common.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp (100%) rename 
{library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/common.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp 
(100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp 
(100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_kn_mn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp (100%) create mode 100755 
composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp (90%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp (90%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp (100%) 
rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt (100%) rename {library 
=> composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp 
(100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_bias/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp (100%) rename 
{library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp (100%) rename {library => 
composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp 
(100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp (100%) rename {library => composable_kernel/library}/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp (100%) create mode 100755 
composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/CMakeLists.txt create mode 100755 composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100755 composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instance.cpp rename {library => composable_kernel/library}/src/utility/CMakeLists.txt (100%) rename {library => composable_kernel/library}/src/utility/convolution_parameter.cpp (100%) rename {library => composable_kernel/library}/src/utility/device_memory.cpp (100%) rename {library => composable_kernel/library}/src/utility/host_tensor.cpp (100%) rename {profiler => composable_kernel/profiler}/CMakeLists.txt (100%) rename {profiler => composable_kernel/profiler}/README.md (100%) rename {profiler => composable_kernel/profiler}/include/profiler/data_type_enum.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_avg_pool3d_bwd_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batched_gemm_gemm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batched_gemm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batched_gemm_reduce_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp (100%) rename {profiler => 
composable_kernel/profiler}/include/profiler/profile_batchnorm_backward_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batchnorm_forward_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_batchnorm_infer_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_contraction_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_contraction_utils.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_conv_bwd_data_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_conv_fwd_bias_relu_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_conv_fwd_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_conv_tensor_rearrange_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_elementwise_layernorm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_add_fastgelu_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_add_multiply_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_bias_add_reduce_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_bilinear_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_fastgelu_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_impl.hpp (100%) 
rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_multiply_add_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_reduce_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_splitk_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemm_streamk_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_gemv_splitk_impl.hpp (98%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_grouped_conv_bwd_data_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_grouped_conv_fwd_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_grouped_gemm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_groupnorm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_layernorm_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_max_pool3d_bwd_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_pool3d_fwd_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_reduce_impl.hpp (100%) rename {profiler => composable_kernel/profiler}/include/profiler/profile_softmax_impl.hpp (100%) create mode 100755 composable_kernel/profiler/include/profiler/profile_tall_and_skinny_gemm_splitk_impl.hpp rename {profiler => composable_kernel/profiler}/src/CMakeLists.txt (100%) rename {profiler => composable_kernel/profiler}/src/profile_avg_pool3d_bwd.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_batched_gemm.cpp 
(100%) rename {profiler => composable_kernel/profiler}/src/profile_batched_gemm_add_relu_gemm_add.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_batched_gemm_gemm.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_batched_gemm_multi_d.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_batched_gemm_reduce.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_batchnorm_bwd.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_batchnorm_fwd.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_batchnorm_infer.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_contraction_bilinear.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_contraction_scale.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_conv_bwd_data.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_conv_fwd.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_conv_fwd_bias_relu.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_conv_fwd_bias_relu_add.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_conv_tensor_rearrange.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_add_add_fastgelu.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_add_fastgelu.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_add_multiply.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_add_relu_add_layernorm.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_bias_add_reduce.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_bilinear.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_fastgelu.cpp (100%) rename {profiler => 
composable_kernel/profiler}/src/profile_gemm_multiply_add.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_reduce.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_splitk.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemm_streamk.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_gemv_splitk.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_grouped_conv_bwd_data.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_grouped_conv_bwd_weight.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_grouped_conv_fwd.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_grouped_gemm.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_grouped_gemm_fastgelu.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_groupnorm.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_layernorm.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_max_pool3d_bwd.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_max_pool3d_fwd.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_reduce.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profile_softmax.cpp (100%) create mode 100755 composable_kernel/profiler/src/profile_tall_and_skinny_gemm_splitk.cpp rename {profiler => composable_kernel/profiler}/src/profiler.cpp (100%) rename {profiler => composable_kernel/profiler}/src/profiler_operation_registry.hpp (100%) rename rbuild.ini => composable_kernel/rbuild.ini (100%) rename requirements.txt => composable_kernel/requirements.txt (100%) rename {script => composable_kernel/script}/check_copyright_year.sh (100%) rename {script => composable_kernel/script}/clang-format-overwrite.sh (100%) rename {script => composable_kernel/script}/cmake-ck-dev.sh (100%) rename {script => 
composable_kernel/script}/cmake-ck-release.sh (100%) rename {script => composable_kernel/script}/count_vgpr.sh (100%) rename {script => composable_kernel/script}/hipclang_opt.sh (100%) rename {script => composable_kernel/script}/install_precommit.sh (100%) rename {script => composable_kernel/script}/parse_perf_data.py (100%) rename {script => composable_kernel/script}/process_perf_data.py (100%) rename {script => composable_kernel/script}/process_perf_data.sh (100%) rename {script => composable_kernel/script}/process_qa_data.sh (100%) rename {script => composable_kernel/script}/profile_batched_gemm.sh (100%) rename {script => composable_kernel/script}/profile_conv_bwd_data.sh (100%) rename {script => composable_kernel/script}/profile_conv_fwd.sh (100%) rename {script => composable_kernel/script}/profile_gemm.sh (100%) rename {script => composable_kernel/script}/profile_gemm_bilinear.sh (100%) rename {script => composable_kernel/script}/profile_grouped_gemm.sh (100%) rename {script => composable_kernel/script}/profile_onnx_gemm.sh (100%) rename {script => composable_kernel/script}/profile_reduce_no_index.sh (100%) rename {script => composable_kernel/script}/profile_reduce_with_index.sh (100%) rename {script => composable_kernel/script}/profile_resnet50.sh (100%) rename {script => composable_kernel/script}/profile_splitK_gemm.sh (100%) rename {script => composable_kernel/script}/run_full_performance_tests.sh (100%) rename {script => composable_kernel/script}/run_performance_tests.sh (100%) rename {script => composable_kernel/script}/test_convnd_fwd.sh (100%) rename {script => composable_kernel/script}/test_reduce_no_index.sh (100%) rename {script => composable_kernel/script}/test_reduce_with_index.sh (100%) rename {script => composable_kernel/script}/uninstall_precommit.sh (100%) rename {test => composable_kernel/test}/CMakeLists.txt (100%) rename {test => composable_kernel/test}/batched_gemm/CMakeLists.txt (100%) rename {test => 
composable_kernel/test}/batched_gemm/batched_gemm_bf16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm/batched_gemm_fp16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm/batched_gemm_fp32.cpp (100%) rename {test => composable_kernel/test}/batched_gemm/batched_gemm_int8.cpp (100%) rename {test => composable_kernel/test}/batched_gemm/test_batched_gemm.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_gemm/CMakeLists.txt (100%) rename {test => composable_kernel/test}/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp (100%) rename {test => composable_kernel/test}/batched_gemm_multi_d/CMakeLists.txt (100%) rename {test => composable_kernel/test}/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_reduce/CMakeLists.txt (100%) rename {test => composable_kernel/test}/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm/CMakeLists.txt (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm_permute/CMakeLists.txt (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp (100%) rename {test => 
composable_kernel/test}/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp (100%) rename {test => composable_kernel/test}/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp (100%) rename {test => composable_kernel/test}/batchnorm/CMakeLists.txt (100%) rename {test => composable_kernel/test}/batchnorm/batchnorm_bwd_rank_4.cpp (100%) rename {test => composable_kernel/test}/batchnorm/batchnorm_fwd_rank_4.cpp (100%) rename {test => composable_kernel/test}/batchnorm/batchnorm_infer_rank_4.cpp (100%) rename {test => composable_kernel/test}/block_swizzle_test/block_swizzle_test.cpp (100%) rename {test => composable_kernel/test}/block_swizzle_test/rebuild.sh (100%) rename {test => composable_kernel/test}/block_swizzle_test/simple_args.h (100%) rename {test => composable_kernel/test}/block_to_ctile_map/CMakeLists.txt (100%) rename {test => composable_kernel/test}/block_to_ctile_map/test_block_to_ctile_map.cpp (100%) rename {test => composable_kernel/test}/contraction/CMakeLists.txt (100%) rename {test => composable_kernel/test}/contraction/test_contraction.cpp (100%) rename {test => composable_kernel/test}/contraction/test_contraction_interface.cpp (100%) rename {test => composable_kernel/test}/conv_tensor_rearrange/CMakeLists.txt (100%) rename {test => composable_kernel/test}/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp (100%) rename {test => composable_kernel/test}/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp (100%) rename {test => composable_kernel/test}/conv_util/CMakeLists.txt (100%) rename {test => composable_kernel/test}/conv_util/conv_util.cpp (100%) rename {test => composable_kernel/test}/convnd_bwd_data/CMakeLists.txt (100%) rename {test => composable_kernel/test}/convnd_bwd_data/convnd_bwd_data.cpp (100%) rename {test => composable_kernel/test}/convnd_fwd/CMakeLists.txt (100%) rename {test => composable_kernel/test}/convnd_fwd/convnd_fwd.cpp (100%) rename {test => 
composable_kernel/test}/data_type/CMakeLists.txt (100%) rename {test => composable_kernel/test}/data_type/test_bf8.cpp (100%) rename {test => composable_kernel/test}/data_type/test_fp8.cpp (100%) rename {test => composable_kernel/test}/data_type/test_int4.cpp (100%) rename {test => composable_kernel/test}/data_type/type_convert_const.cpp (100%) rename {test => composable_kernel/test}/elementwise_normalization/CMakeLists.txt (100%) rename {test => composable_kernel/test}/elementwise_normalization/test_elementwise_layernorm_fp16.cpp (100%) rename {test => composable_kernel/test}/gemm/CMakeLists.txt (100%) rename {test => composable_kernel/test}/gemm/gemm_bf16.cpp (100%) rename {test => composable_kernel/test}/gemm/gemm_fp16.cpp (100%) rename {test => composable_kernel/test}/gemm/gemm_fp32.cpp (100%) rename {test => composable_kernel/test}/gemm/gemm_fp64.cpp (100%) rename {test => composable_kernel/test}/gemm/gemm_int8.cpp (100%) rename {test => composable_kernel/test}/gemm/gemm_standalone_xdl_fp16.cpp (100%) rename {test => composable_kernel/test}/gemm/gemm_util.hpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_nn_instance.cpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_nn_instance.hpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_nt_instance.cpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_nt_instance.hpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_tn_instance.cpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_tn_instance.hpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_tt_instance.cpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_f16_tt_instance.hpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_wavelet_f16_tn_instance.cpp (100%) rename {test => composable_kernel/test}/gemm/instance/gemm_wavelet_f16_tn_instance.hpp (100%) rename {test => 
composable_kernel/test}/gemm/run_gemm_test.inc (100%) rename {test => composable_kernel/test}/gemm_layernorm/CMakeLists.txt (100%) rename {test => composable_kernel/test}/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp (100%) rename {test => composable_kernel/test}/gemm_reduce/CMakeLists.txt (100%) rename {test => composable_kernel/test}/gemm_reduce/gemm_reduce_fp16.cpp (100%) rename {test => composable_kernel/test}/gemm_split_k/CMakeLists.txt (100%) rename {test => composable_kernel/test}/gemm_split_k/test_gemm_splitk.cpp (100%) rename {test => composable_kernel/test}/gemm_split_k/test_gemm_splitk_ut_cases.inc (100%) rename {test => composable_kernel/test}/gemm_split_k/test_gemm_splitk_util.hpp (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_data/CMakeLists.txt (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_weight/CMakeLists.txt (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp (100%) rename {test => composable_kernel/test}/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp (100%) rename {test => composable_kernel/test}/grouped_convnd_fwd/CMakeLists.txt (100%) rename {test => composable_kernel/test}/grouped_convnd_fwd/grouped_convnd_fwd.cpp (100%) rename {test => composable_kernel/test}/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp (100%) rename {test => composable_kernel/test}/grouped_gemm/CMakeLists.txt (100%) rename {test => 
composable_kernel/test}/grouped_gemm/test_grouped_gemm_interface.cpp (100%) rename {test => composable_kernel/test}/grouped_gemm/test_grouped_gemm_splitk.cpp (100%) rename {test => composable_kernel/test}/grouped_gemm/test_grouped_gemm_ut_cases.inc (100%) rename {test => composable_kernel/test}/grouped_gemm/test_grouped_gemm_util.hpp (100%) rename {test => composable_kernel/test}/image_to_column/CMakeLists.txt (100%) rename {test => composable_kernel/test}/image_to_column/test_image_to_column.cpp (100%) rename {test => composable_kernel/test}/image_to_column/test_image_to_column_interface.cpp (100%) rename {test => composable_kernel/test}/magic_number_division/CMakeLists.txt (100%) rename {test => composable_kernel/test}/magic_number_division/magic_number_division.cpp (100%) rename {test => composable_kernel/test}/normalization/CMakeLists.txt (100%) rename {test => composable_kernel/test}/normalization/test_groupnorm_fp16.cpp (100%) rename {test => composable_kernel/test}/normalization/test_groupnorm_fp32.cpp (100%) rename {test => composable_kernel/test}/normalization/test_layernorm2d_fp16.cpp (100%) rename {test => composable_kernel/test}/normalization/test_layernorm2d_fp32.cpp (100%) rename {test => composable_kernel/test}/pool/CMakeLists.txt (100%) rename {test => composable_kernel/test}/pool/test_avg_pool3d_bwd.cpp (100%) rename {test => composable_kernel/test}/pool/test_avg_pool3d_fwd.cpp (100%) rename {test => composable_kernel/test}/pool/test_max_pool3d_bwd.cpp (100%) rename {test => composable_kernel/test}/pool/test_max_pool3d_fwd.cpp (100%) rename {test => composable_kernel/test}/pool/test_pool_fwd_common.hpp (100%) rename {test => composable_kernel/test}/reduce/CMakeLists.txt (100%) rename {test => composable_kernel/test}/reduce/reduce_no_index.cpp (100%) rename {test => composable_kernel/test}/reduce/reduce_with_index.cpp (100%) rename {test => composable_kernel/test}/reference_conv_fwd/CMakeLists.txt (100%) rename {test => 
composable_kernel/test}/reference_conv_fwd/reference_conv_fwd.cpp (100%) rename {test => composable_kernel/test}/softmax/CMakeLists.txt (100%) rename {test => composable_kernel/test}/softmax/test_softmax_interface.cpp (100%) rename {test => composable_kernel/test}/softmax/test_softmax_rank3.cpp (100%) rename {test => composable_kernel/test}/softmax/test_softmax_rank4.cpp (100%) rename {test => composable_kernel/test}/softmax/test_softmax_ut_cases.inc (100%) rename {test => composable_kernel/test}/softmax/test_softmax_util.hpp (100%) rename {test => composable_kernel/test}/space_filling_curve/CMakeLists.txt (100%) rename {test => composable_kernel/test}/space_filling_curve/space_filling_curve.cpp (100%) rename {test => composable_kernel/test}/wmma_op/CMakeLists.txt (100%) rename {test => composable_kernel/test}/wmma_op/wmma_op.cpp (100%) rename {test => composable_kernel/test}/wmma_op/wmma_op_util.hpp (100%) delete mode 100755 library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt diff --git a/CHANGELOG.md b/composable_kernel/CHANGELOG.md similarity index 100% rename from CHANGELOG.md rename to composable_kernel/CHANGELOG.md diff --git a/CITATION.cff b/composable_kernel/CITATION.cff similarity index 100% rename from CITATION.cff rename to composable_kernel/CITATION.cff diff --git a/CMakeLists.txt b/composable_kernel/CMakeLists.txt similarity index 100% rename from CMakeLists.txt rename to composable_kernel/CMakeLists.txt diff --git a/CONTRIBUTORS.md b/composable_kernel/CONTRIBUTORS.md similarity index 100% rename from CONTRIBUTORS.md rename to composable_kernel/CONTRIBUTORS.md diff --git a/Config.cmake.in b/composable_kernel/Config.cmake.in similarity index 100% rename from Config.cmake.in rename to composable_kernel/Config.cmake.in diff --git a/Dockerfile b/composable_kernel/Dockerfile similarity index 100% rename from Dockerfile rename to composable_kernel/Dockerfile diff --git a/Jenkinsfile b/composable_kernel/Jenkinsfile similarity index 100% 
rename from Jenkinsfile rename to composable_kernel/Jenkinsfile diff --git a/LICENSE b/composable_kernel/LICENSE similarity index 100% rename from LICENSE rename to composable_kernel/LICENSE diff --git a/README.md b/composable_kernel/README.md similarity index 100% rename from README.md rename to composable_kernel/README.md diff --git a/client_example/01_gemm/CMakeLists.txt b/composable_kernel/client_example/01_gemm/CMakeLists.txt similarity index 100% rename from client_example/01_gemm/CMakeLists.txt rename to composable_kernel/client_example/01_gemm/CMakeLists.txt diff --git a/client_example/01_gemm/gemm.cpp b/composable_kernel/client_example/01_gemm/gemm.cpp similarity index 100% rename from client_example/01_gemm/gemm.cpp rename to composable_kernel/client_example/01_gemm/gemm.cpp diff --git a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt b/composable_kernel/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt similarity index 100% rename from client_example/02_gemm_add_add_fastgelu/CMakeLists.txt rename to composable_kernel/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp similarity index 100% rename from client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp rename to composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp b/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp similarity index 100% rename from client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp rename to composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp 
b/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp similarity index 100% rename from client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp rename to composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp b/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp similarity index 100% rename from client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp rename to composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp similarity index 100% rename from client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp rename to composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp b/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp similarity index 100% rename from client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp rename to composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp diff --git a/client_example/03_gemm_layernorm/CMakeLists.txt b/composable_kernel/client_example/03_gemm_layernorm/CMakeLists.txt similarity index 100% rename from client_example/03_gemm_layernorm/CMakeLists.txt rename to composable_kernel/client_example/03_gemm_layernorm/CMakeLists.txt diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/composable_kernel/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp similarity index 100% rename from client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp rename to composable_kernel/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp diff --git 
a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp b/composable_kernel/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp similarity index 100% rename from client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp rename to composable_kernel/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp diff --git a/client_example/04_contraction/CMakeLists.txt b/composable_kernel/client_example/04_contraction/CMakeLists.txt similarity index 100% rename from client_example/04_contraction/CMakeLists.txt rename to composable_kernel/client_example/04_contraction/CMakeLists.txt diff --git a/client_example/04_contraction/contraction_bilinear_fp32.cpp b/composable_kernel/client_example/04_contraction/contraction_bilinear_fp32.cpp similarity index 100% rename from client_example/04_contraction/contraction_bilinear_fp32.cpp rename to composable_kernel/client_example/04_contraction/contraction_bilinear_fp32.cpp diff --git a/client_example/04_contraction/contraction_bilinear_fp64.cpp b/composable_kernel/client_example/04_contraction/contraction_bilinear_fp64.cpp similarity index 100% rename from client_example/04_contraction/contraction_bilinear_fp64.cpp rename to composable_kernel/client_example/04_contraction/contraction_bilinear_fp64.cpp diff --git a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp b/composable_kernel/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp similarity index 100% rename from client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp rename to composable_kernel/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp diff --git a/client_example/04_contraction/contraction_scale_fp32.cpp b/composable_kernel/client_example/04_contraction/contraction_scale_fp32.cpp similarity index 100% rename from client_example/04_contraction/contraction_scale_fp32.cpp rename to 
composable_kernel/client_example/04_contraction/contraction_scale_fp32.cpp diff --git a/client_example/04_contraction/contraction_scale_fp64.cpp b/composable_kernel/client_example/04_contraction/contraction_scale_fp64.cpp similarity index 100% rename from client_example/04_contraction/contraction_scale_fp64.cpp rename to composable_kernel/client_example/04_contraction/contraction_scale_fp64.cpp diff --git a/client_example/05_layernorm/CMakeLists.txt b/composable_kernel/client_example/05_layernorm/CMakeLists.txt similarity index 100% rename from client_example/05_layernorm/CMakeLists.txt rename to composable_kernel/client_example/05_layernorm/CMakeLists.txt diff --git a/client_example/05_layernorm/layernorm2d.cpp b/composable_kernel/client_example/05_layernorm/layernorm2d.cpp similarity index 100% rename from client_example/05_layernorm/layernorm2d.cpp rename to composable_kernel/client_example/05_layernorm/layernorm2d.cpp diff --git a/client_example/06_softmax/CMakeLists.txt b/composable_kernel/client_example/06_softmax/CMakeLists.txt similarity index 100% rename from client_example/06_softmax/CMakeLists.txt rename to composable_kernel/client_example/06_softmax/CMakeLists.txt diff --git a/client_example/06_softmax/softmax4d.cpp b/composable_kernel/client_example/06_softmax/softmax4d.cpp similarity index 100% rename from client_example/06_softmax/softmax4d.cpp rename to composable_kernel/client_example/06_softmax/softmax4d.cpp diff --git a/client_example/07_grouped_convnd_fwd/CMakeLists.txt b/composable_kernel/client_example/07_grouped_convnd_fwd/CMakeLists.txt similarity index 100% rename from client_example/07_grouped_convnd_fwd/CMakeLists.txt rename to composable_kernel/client_example/07_grouped_convnd_fwd/CMakeLists.txt diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp b/composable_kernel/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp similarity index 100% rename from client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp 
rename to composable_kernel/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp b/composable_kernel/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp similarity index 100% rename from client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp rename to composable_kernel/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp diff --git a/client_example/08_fused_attention/CMakeLists.txt b/composable_kernel/client_example/08_fused_attention/CMakeLists.txt similarity index 100% rename from client_example/08_fused_attention/CMakeLists.txt rename to composable_kernel/client_example/08_fused_attention/CMakeLists.txt diff --git a/client_example/08_fused_attention/fused_attention.cpp b/composable_kernel/client_example/08_fused_attention/fused_attention.cpp similarity index 100% rename from client_example/08_fused_attention/fused_attention.cpp rename to composable_kernel/client_example/08_fused_attention/fused_attention.cpp diff --git a/client_example/08_fused_attention/fused_attention_bias.cpp b/composable_kernel/client_example/08_fused_attention/fused_attention_bias.cpp similarity index 100% rename from client_example/08_fused_attention/fused_attention_bias.cpp rename to composable_kernel/client_example/08_fused_attention/fused_attention_bias.cpp diff --git a/client_example/09_quantization/CMakeLists.txt b/composable_kernel/client_example/09_quantization/CMakeLists.txt similarity index 100% rename from client_example/09_quantization/CMakeLists.txt rename to composable_kernel/client_example/09_quantization/CMakeLists.txt diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp similarity index 100% rename from client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp rename to 
composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp similarity index 100% rename from client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp rename to composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp b/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp similarity index 100% rename from client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp rename to composable_kernel/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp b/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp similarity index 100% rename from client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp rename to composable_kernel/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/composable_kernel/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp similarity index 100% rename from client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp rename to composable_kernel/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/composable_kernel/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp similarity index 100% rename from client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp rename to 
composable_kernel/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp diff --git a/client_example/09_quantization/gemm_quantization.cpp b/composable_kernel/client_example/09_quantization/gemm_quantization.cpp similarity index 100% rename from client_example/09_quantization/gemm_quantization.cpp rename to composable_kernel/client_example/09_quantization/gemm_quantization.cpp diff --git a/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt b/composable_kernel/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt similarity index 100% rename from client_example/10_grouped_convnd_bwd_data/CMakeLists.txt rename to composable_kernel/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt diff --git a/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data.cpp b/composable_kernel/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data.cpp similarity index 100% rename from client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data.cpp rename to composable_kernel/client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data.cpp diff --git a/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data.cpp b/composable_kernel/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data.cpp similarity index 100% rename from client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data.cpp rename to composable_kernel/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data.cpp diff --git a/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp b/composable_kernel/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp similarity index 100% rename from client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp rename to composable_kernel/client_example/10_grouped_convnd_bwd_data/grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp diff --git a/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt 
b/composable_kernel/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt similarity index 100% rename from client_example/11_grouped_conv_bwd_weight/CMakeLists.txt rename to composable_kernel/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt diff --git a/client_example/11_grouped_conv_bwd_weight/common.hpp b/composable_kernel/client_example/11_grouped_conv_bwd_weight/common.hpp similarity index 100% rename from client_example/11_grouped_conv_bwd_weight/common.hpp rename to composable_kernel/client_example/11_grouped_conv_bwd_weight/common.hpp diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp b/composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp similarity index 100% rename from client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp rename to composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp b/composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp similarity index 100% rename from client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp rename to composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp b/composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp similarity index 100% rename from client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp rename to composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp b/composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp similarity index 100% rename from 
client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp rename to composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp diff --git a/client_example/12_elementwise_normalization/CMakeLists.txt b/composable_kernel/client_example/12_elementwise_normalization/CMakeLists.txt similarity index 100% rename from client_example/12_elementwise_normalization/CMakeLists.txt rename to composable_kernel/client_example/12_elementwise_normalization/CMakeLists.txt diff --git a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp b/composable_kernel/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp similarity index 100% rename from client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp rename to composable_kernel/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp diff --git a/client_example/13_batchnorm/CMakeLists.txt b/composable_kernel/client_example/13_batchnorm/CMakeLists.txt similarity index 100% rename from client_example/13_batchnorm/CMakeLists.txt rename to composable_kernel/client_example/13_batchnorm/CMakeLists.txt diff --git a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp b/composable_kernel/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp similarity index 100% rename from client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp rename to composable_kernel/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp diff --git a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp b/composable_kernel/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp similarity index 100% rename from client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp rename to composable_kernel/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp diff --git a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp b/composable_kernel/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp similarity index 100% rename from client_example/13_batchnorm/batchnorm_infer_nhwc.cpp rename to 
composable_kernel/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp diff --git a/client_example/14_instance_id/CMakeLists.txt b/composable_kernel/client_example/14_instance_id/CMakeLists.txt similarity index 100% rename from client_example/14_instance_id/CMakeLists.txt rename to composable_kernel/client_example/14_instance_id/CMakeLists.txt diff --git a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp b/composable_kernel/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp similarity index 100% rename from client_example/14_instance_id/batchnorm_fwd_instance_id.cpp rename to composable_kernel/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp diff --git a/client_example/15_convnd_bwd_data/CMakeLists.txt b/composable_kernel/client_example/15_convnd_bwd_data/CMakeLists.txt similarity index 100% rename from client_example/15_convnd_bwd_data/CMakeLists.txt rename to composable_kernel/client_example/15_convnd_bwd_data/CMakeLists.txt diff --git a/client_example/15_convnd_bwd_data/common.hpp b/composable_kernel/client_example/15_convnd_bwd_data/common.hpp similarity index 100% rename from client_example/15_convnd_bwd_data/common.hpp rename to composable_kernel/client_example/15_convnd_bwd_data/common.hpp diff --git a/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp b/composable_kernel/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp similarity index 100% rename from client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp rename to composable_kernel/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp diff --git a/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp b/composable_kernel/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp similarity index 100% rename from client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp rename to composable_kernel/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp diff --git a/client_example/15_gemm_add_multiply/CMakeLists.txt 
b/composable_kernel/client_example/15_gemm_add_multiply/CMakeLists.txt similarity index 100% rename from client_example/15_gemm_add_multiply/CMakeLists.txt rename to composable_kernel/client_example/15_gemm_add_multiply/CMakeLists.txt diff --git a/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp b/composable_kernel/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp similarity index 100% rename from client_example/15_gemm_add_multiply/gemm_add_multiply.cpp rename to composable_kernel/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp diff --git a/client_example/15_reduce/CMakeLists.txt b/composable_kernel/client_example/15_reduce/CMakeLists.txt similarity index 100% rename from client_example/15_reduce/CMakeLists.txt rename to composable_kernel/client_example/15_reduce/CMakeLists.txt diff --git a/client_example/15_reduce/reduce_nhwc_c.cpp b/composable_kernel/client_example/15_reduce/reduce_nhwc_c.cpp similarity index 100% rename from client_example/15_reduce/reduce_nhwc_c.cpp rename to composable_kernel/client_example/15_reduce/reduce_nhwc_c.cpp diff --git a/client_example/16_convnd_fwd/CMakeLists.txt b/composable_kernel/client_example/16_convnd_fwd/CMakeLists.txt similarity index 100% rename from client_example/16_convnd_fwd/CMakeLists.txt rename to composable_kernel/client_example/16_convnd_fwd/CMakeLists.txt diff --git a/client_example/16_convnd_fwd/common.hpp b/composable_kernel/client_example/16_convnd_fwd/common.hpp similarity index 100% rename from client_example/16_convnd_fwd/common.hpp rename to composable_kernel/client_example/16_convnd_fwd/common.hpp diff --git a/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp b/composable_kernel/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp similarity index 100% rename from client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp rename to composable_kernel/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp diff --git a/client_example/16_convnd_fwd/conv3d_fwd_fp16_comp_fp8.cpp 
b/composable_kernel/client_example/16_convnd_fwd/conv3d_fwd_fp16_comp_fp8.cpp similarity index 100% rename from client_example/16_convnd_fwd/conv3d_fwd_fp16_comp_fp8.cpp rename to composable_kernel/client_example/16_convnd_fwd/conv3d_fwd_fp16_comp_fp8.cpp diff --git a/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp b/composable_kernel/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp similarity index 100% rename from client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp rename to composable_kernel/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp diff --git a/client_example/17_grouped_gemm_fastgelu/CMakeLists.txt b/composable_kernel/client_example/17_grouped_gemm_fastgelu/CMakeLists.txt similarity index 100% rename from client_example/17_grouped_gemm_fastgelu/CMakeLists.txt rename to composable_kernel/client_example/17_grouped_gemm_fastgelu/CMakeLists.txt diff --git a/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp b/composable_kernel/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp similarity index 100% rename from client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp rename to composable_kernel/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp diff --git a/client_example/18_groupnorm/CMakeLists.txt b/composable_kernel/client_example/18_groupnorm/CMakeLists.txt similarity index 100% rename from client_example/18_groupnorm/CMakeLists.txt rename to composable_kernel/client_example/18_groupnorm/CMakeLists.txt diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/composable_kernel/client_example/18_groupnorm/groupnorm_swish.cpp similarity index 100% rename from client_example/18_groupnorm/groupnorm_swish.cpp rename to composable_kernel/client_example/18_groupnorm/groupnorm_swish.cpp diff --git a/client_example/19_pool/CMakeLists.txt b/composable_kernel/client_example/19_pool/CMakeLists.txt similarity index 100% rename from client_example/19_pool/CMakeLists.txt rename to 
composable_kernel/client_example/19_pool/CMakeLists.txt diff --git a/client_example/19_pool/avg_pool3d_bwd.cpp b/composable_kernel/client_example/19_pool/avg_pool3d_bwd.cpp similarity index 100% rename from client_example/19_pool/avg_pool3d_bwd.cpp rename to composable_kernel/client_example/19_pool/avg_pool3d_bwd.cpp diff --git a/client_example/19_pool/avg_pool3d_fwd.cpp b/composable_kernel/client_example/19_pool/avg_pool3d_fwd.cpp similarity index 100% rename from client_example/19_pool/avg_pool3d_fwd.cpp rename to composable_kernel/client_example/19_pool/avg_pool3d_fwd.cpp diff --git a/client_example/19_pool/max_pool2d_bwd.cpp b/composable_kernel/client_example/19_pool/max_pool2d_bwd.cpp similarity index 100% rename from client_example/19_pool/max_pool2d_bwd.cpp rename to composable_kernel/client_example/19_pool/max_pool2d_bwd.cpp diff --git a/client_example/19_pool/max_pool2d_fwd.cpp b/composable_kernel/client_example/19_pool/max_pool2d_fwd.cpp similarity index 100% rename from client_example/19_pool/max_pool2d_fwd.cpp rename to composable_kernel/client_example/19_pool/max_pool2d_fwd.cpp diff --git a/client_example/20_splitk_gemm/CMakeLists.txt b/composable_kernel/client_example/20_splitk_gemm/CMakeLists.txt similarity index 100% rename from client_example/20_splitk_gemm/CMakeLists.txt rename to composable_kernel/client_example/20_splitk_gemm/CMakeLists.txt diff --git a/client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp b/composable_kernel/client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp similarity index 100% rename from client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp rename to composable_kernel/client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp diff --git a/client_example/21_grouped_gemm_bias/CMakeLists.txt b/composable_kernel/client_example/21_grouped_gemm_bias/CMakeLists.txt similarity index 100% rename from client_example/21_grouped_gemm_bias/CMakeLists.txt rename to composable_kernel/client_example/21_grouped_gemm_bias/CMakeLists.txt diff 
--git a/client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp b/composable_kernel/client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp similarity index 100% rename from client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp rename to composable_kernel/client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp diff --git a/client_example/22_grouped_gemm/CMakeLists.txt b/composable_kernel/client_example/22_grouped_gemm/CMakeLists.txt similarity index 100% rename from client_example/22_grouped_gemm/CMakeLists.txt rename to composable_kernel/client_example/22_grouped_gemm/CMakeLists.txt diff --git a/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp16.cpp b/composable_kernel/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp16.cpp similarity index 100% rename from client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp16.cpp rename to composable_kernel/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp16.cpp diff --git a/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp8.cpp b/composable_kernel/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp8.cpp similarity index 100% rename from client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp8.cpp rename to composable_kernel/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp8.cpp diff --git a/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_i8.cpp b/composable_kernel/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_i8.cpp similarity index 100% rename from client_example/22_grouped_gemm/grouped_gemm_fixed_nk_i8.cpp rename to composable_kernel/client_example/22_grouped_gemm/grouped_gemm_fixed_nk_i8.cpp diff --git a/client_example/22_im2col_col2im/CMakeLists.txt b/composable_kernel/client_example/22_im2col_col2im/CMakeLists.txt similarity index 100% rename from client_example/22_im2col_col2im/CMakeLists.txt rename to composable_kernel/client_example/22_im2col_col2im/CMakeLists.txt diff --git 
a/client_example/22_im2col_col2im/column_to_image.cpp b/composable_kernel/client_example/22_im2col_col2im/column_to_image.cpp similarity index 100% rename from client_example/22_im2col_col2im/column_to_image.cpp rename to composable_kernel/client_example/22_im2col_col2im/column_to_image.cpp diff --git a/client_example/22_im2col_col2im/image_to_column.cpp b/composable_kernel/client_example/22_im2col_col2im/image_to_column.cpp similarity index 100% rename from client_example/22_im2col_col2im/image_to_column.cpp rename to composable_kernel/client_example/22_im2col_col2im/image_to_column.cpp diff --git a/client_example/CMakeLists.txt b/composable_kernel/client_example/CMakeLists.txt similarity index 100% rename from client_example/CMakeLists.txt rename to composable_kernel/client_example/CMakeLists.txt diff --git a/client_example/README.md b/composable_kernel/client_example/README.md similarity index 100% rename from client_example/README.md rename to composable_kernel/client_example/README.md diff --git a/cmake/Analyzers.cmake b/composable_kernel/cmake/Analyzers.cmake similarity index 100% rename from cmake/Analyzers.cmake rename to composable_kernel/cmake/Analyzers.cmake diff --git a/cmake/ClangTidy.cmake b/composable_kernel/cmake/ClangTidy.cmake similarity index 100% rename from cmake/ClangTidy.cmake rename to composable_kernel/cmake/ClangTidy.cmake diff --git a/cmake/CppCheck.cmake b/composable_kernel/cmake/CppCheck.cmake similarity index 100% rename from cmake/CppCheck.cmake rename to composable_kernel/cmake/CppCheck.cmake diff --git a/cmake/DoxygenDoc.cmake b/composable_kernel/cmake/DoxygenDoc.cmake similarity index 100% rename from cmake/DoxygenDoc.cmake rename to composable_kernel/cmake/DoxygenDoc.cmake diff --git a/cmake/EnableCompilerWarnings.cmake b/composable_kernel/cmake/EnableCompilerWarnings.cmake similarity index 100% rename from cmake/EnableCompilerWarnings.cmake rename to composable_kernel/cmake/EnableCompilerWarnings.cmake diff --git 
a/cmake/TargetFlags.cmake b/composable_kernel/cmake/TargetFlags.cmake similarity index 100% rename from cmake/TargetFlags.cmake rename to composable_kernel/cmake/TargetFlags.cmake diff --git a/cmake/googletest.cmake b/composable_kernel/cmake/googletest.cmake similarity index 100% rename from cmake/googletest.cmake rename to composable_kernel/cmake/googletest.cmake diff --git a/dev-requirements.txt b/composable_kernel/dev-requirements.txt similarity index 100% rename from dev-requirements.txt rename to composable_kernel/dev-requirements.txt diff --git a/docs/API_Reference_Guide.rst b/composable_kernel/docs/API_Reference_Guide.rst similarity index 100% rename from docs/API_Reference_Guide.rst rename to composable_kernel/docs/API_Reference_Guide.rst diff --git a/docs/Contributors_Guide.rst b/composable_kernel/docs/Contributors_Guide.rst similarity index 100% rename from docs/Contributors_Guide.rst rename to composable_kernel/docs/Contributors_Guide.rst diff --git a/docs/Supported_Primitives_Guide.rst b/composable_kernel/docs/Supported_Primitives_Guide.rst similarity index 100% rename from docs/Supported_Primitives_Guide.rst rename to composable_kernel/docs/Supported_Primitives_Guide.rst diff --git a/docs/conf.py b/composable_kernel/docs/conf.py similarity index 100% rename from docs/conf.py rename to composable_kernel/docs/conf.py diff --git a/docs/data/ck_component.png b/composable_kernel/docs/data/ck_component.png similarity index 100% rename from docs/data/ck_component.png rename to composable_kernel/docs/data/ck_component.png diff --git a/docs/data/ck_layer.png b/composable_kernel/docs/data/ck_layer.png similarity index 100% rename from docs/data/ck_layer.png rename to composable_kernel/docs/data/ck_layer.png diff --git a/docs/dockerhub.rst b/composable_kernel/docs/dockerhub.rst similarity index 100% rename from docs/dockerhub.rst rename to composable_kernel/docs/dockerhub.rst diff --git a/docs/doxygen/Doxyfile b/composable_kernel/docs/doxygen/Doxyfile similarity 
index 100% rename from docs/doxygen/Doxyfile rename to composable_kernel/docs/doxygen/Doxyfile diff --git a/docs/index.rst b/composable_kernel/docs/index.rst similarity index 100% rename from docs/index.rst rename to composable_kernel/docs/index.rst diff --git a/docs/license.rst b/composable_kernel/docs/license.rst similarity index 100% rename from docs/license.rst rename to composable_kernel/docs/license.rst diff --git a/docs/refs.bib b/composable_kernel/docs/refs.bib similarity index 100% rename from docs/refs.bib rename to composable_kernel/docs/refs.bib diff --git a/docs/sphinx/_toc.yml.in b/composable_kernel/docs/sphinx/_toc.yml.in similarity index 100% rename from docs/sphinx/_toc.yml.in rename to composable_kernel/docs/sphinx/_toc.yml.in diff --git a/docs/sphinx/requirements.in b/composable_kernel/docs/sphinx/requirements.in similarity index 100% rename from docs/sphinx/requirements.in rename to composable_kernel/docs/sphinx/requirements.in diff --git a/docs/sphinx/requirements.txt b/composable_kernel/docs/sphinx/requirements.txt similarity index 100% rename from docs/sphinx/requirements.txt rename to composable_kernel/docs/sphinx/requirements.txt diff --git a/docs/tutorial_hello_world.rst b/composable_kernel/docs/tutorial_hello_world.rst similarity index 100% rename from docs/tutorial_hello_world.rst rename to composable_kernel/docs/tutorial_hello_world.rst diff --git a/example/01_gemm/CMakeLists.txt b/composable_kernel/example/01_gemm/CMakeLists.txt similarity index 100% rename from example/01_gemm/CMakeLists.txt rename to composable_kernel/example/01_gemm/CMakeLists.txt diff --git a/example/01_gemm/README.md b/composable_kernel/example/01_gemm/README.md similarity index 100% rename from example/01_gemm/README.md rename to composable_kernel/example/01_gemm/README.md diff --git a/example/01_gemm/common.hpp b/composable_kernel/example/01_gemm/common.hpp similarity index 100% rename from example/01_gemm/common.hpp rename to 
composable_kernel/example/01_gemm/common.hpp diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/composable_kernel/example/01_gemm/gemm_dl_fp16.cpp similarity index 100% rename from example/01_gemm/gemm_dl_fp16.cpp rename to composable_kernel/example/01_gemm/gemm_dl_fp16.cpp diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/composable_kernel/example/01_gemm/gemm_dl_fp32.cpp similarity index 100% rename from example/01_gemm/gemm_dl_fp32.cpp rename to composable_kernel/example/01_gemm/gemm_dl_fp32.cpp diff --git a/example/01_gemm/gemm_dl_int4.cpp b/composable_kernel/example/01_gemm/gemm_dl_int4.cpp similarity index 100% rename from example/01_gemm/gemm_dl_int4.cpp rename to composable_kernel/example/01_gemm/gemm_dl_int4.cpp diff --git a/example/01_gemm/gemm_dl_int8.cpp b/composable_kernel/example/01_gemm/gemm_dl_int8.cpp similarity index 100% rename from example/01_gemm/gemm_dl_int8.cpp rename to composable_kernel/example/01_gemm/gemm_dl_int8.cpp diff --git a/example/01_gemm/gemm_dpp_fp16.cpp b/composable_kernel/example/01_gemm/gemm_dpp_fp16.cpp similarity index 100% rename from example/01_gemm/gemm_dpp_fp16.cpp rename to composable_kernel/example/01_gemm/gemm_dpp_fp16.cpp diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/composable_kernel/example/01_gemm/gemm_wmma_fp16.cpp similarity index 100% rename from example/01_gemm/gemm_wmma_fp16.cpp rename to composable_kernel/example/01_gemm/gemm_wmma_fp16.cpp diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/composable_kernel/example/01_gemm/gemm_xdl_bf16.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_bf16.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_bf16.cpp diff --git a/example/01_gemm/gemm_xdl_bf16_rtn.cpp b/composable_kernel/example/01_gemm/gemm_xdl_bf16_rtn.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_bf16_rtn.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_bf16_rtn.cpp diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/composable_kernel/example/01_gemm/gemm_xdl_fp16.cpp 
similarity index 100% rename from example/01_gemm/gemm_xdl_fp16.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_fp16.cpp diff --git a/example/01_gemm/gemm_xdl_fp16_fp8.cpp b/composable_kernel/example/01_gemm/gemm_xdl_fp16_fp8.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_fp16_fp8.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_fp16_fp8.cpp diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/composable_kernel/example/01_gemm/gemm_xdl_fp64.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_fp64.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_fp64.cpp diff --git a/example/01_gemm/gemm_xdl_fp8.cpp b/composable_kernel/example/01_gemm/gemm_xdl_fp8.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_fp8.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_fp8.cpp diff --git a/example/01_gemm/gemm_xdl_fp8_bf8.cpp b/composable_kernel/example/01_gemm/gemm_xdl_fp8_bf8.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_fp8_bf8.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_fp8_bf8.cpp diff --git a/example/01_gemm/gemm_xdl_int4.cpp b/composable_kernel/example/01_gemm/gemm_xdl_int4.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_int4.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_int4.cpp diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/composable_kernel/example/01_gemm/gemm_xdl_int8.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_int8.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_int8.cpp diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/composable_kernel/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp diff --git a/example/01_gemm/gemm_xdl_streamk.cpp b/composable_kernel/example/01_gemm/gemm_xdl_streamk.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_streamk.cpp rename 
to composable_kernel/example/01_gemm/gemm_xdl_streamk.cpp diff --git a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp b/composable_kernel/example/01_gemm/gemm_xdl_wavelet_fp16.cpp similarity index 100% rename from example/01_gemm/gemm_xdl_wavelet_fp16.cpp rename to composable_kernel/example/01_gemm/gemm_xdl_wavelet_fp16.cpp diff --git a/example/01_gemm/run_gemm_example.inc b/composable_kernel/example/01_gemm/run_gemm_example.inc similarity index 100% rename from example/01_gemm/run_gemm_example.inc rename to composable_kernel/example/01_gemm/run_gemm_example.inc diff --git a/example/02_gemm_bilinear/CMakeLists.txt b/composable_kernel/example/02_gemm_bilinear/CMakeLists.txt similarity index 100% rename from example/02_gemm_bilinear/CMakeLists.txt rename to composable_kernel/example/02_gemm_bilinear/CMakeLists.txt diff --git a/example/02_gemm_bilinear/README.md b/composable_kernel/example/02_gemm_bilinear/README.md similarity index 100% rename from example/02_gemm_bilinear/README.md rename to composable_kernel/example/02_gemm_bilinear/README.md diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/composable_kernel/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp similarity index 100% rename from example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp rename to composable_kernel/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp b/composable_kernel/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp similarity index 100% rename from example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp rename to composable_kernel/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/composable_kernel/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp similarity index 100% rename from example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp rename to composable_kernel/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp diff --git 
a/example/03_gemm_bias_relu/CMakeLists.txt b/composable_kernel/example/03_gemm_bias_relu/CMakeLists.txt similarity index 100% rename from example/03_gemm_bias_relu/CMakeLists.txt rename to composable_kernel/example/03_gemm_bias_relu/CMakeLists.txt diff --git a/example/03_gemm_bias_relu/README.md b/composable_kernel/example/03_gemm_bias_relu/README.md similarity index 100% rename from example/03_gemm_bias_relu/README.md rename to composable_kernel/example/03_gemm_bias_relu/README.md diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/composable_kernel/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp similarity index 100% rename from example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp rename to composable_kernel/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/composable_kernel/example/04_gemm_add_add_fastgelu/CMakeLists.txt similarity index 100% rename from example/04_gemm_add_add_fastgelu/CMakeLists.txt rename to composable_kernel/example/04_gemm_add_add_fastgelu/CMakeLists.txt diff --git a/example/04_gemm_add_add_fastgelu/README.md b/composable_kernel/example/04_gemm_add_add_fastgelu/README.md similarity index 100% rename from example/04_gemm_add_add_fastgelu/README.md rename to composable_kernel/example/04_gemm_add_add_fastgelu/README.md diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/composable_kernel/example/04_gemm_add_add_fastgelu/common.hpp similarity index 100% rename from example/04_gemm_add_add_fastgelu/common.hpp rename to composable_kernel/example/04_gemm_add_add_fastgelu/common.hpp diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp similarity index 100% rename from example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp rename to composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp diff 
--git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp similarity index 100% rename from example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp rename to composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp similarity index 100% rename from example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp rename to composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp b/composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp similarity index 100% rename from example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp rename to composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp b/composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp similarity index 100% rename from example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp rename to composable_kernel/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/composable_kernel/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc similarity index 100% rename from example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc rename to composable_kernel/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc diff --git a/example/09_convnd_fwd/CMakeLists.txt b/composable_kernel/example/09_convnd_fwd/CMakeLists.txt similarity index 100% rename from 
example/09_convnd_fwd/CMakeLists.txt rename to composable_kernel/example/09_convnd_fwd/CMakeLists.txt diff --git a/example/09_convnd_fwd/README.md b/composable_kernel/example/09_convnd_fwd/README.md similarity index 100% rename from example/09_convnd_fwd/README.md rename to composable_kernel/example/09_convnd_fwd/README.md diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_common.hpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_common.hpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_common.hpp diff --git a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_common.hpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_dl_common.hpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_common.hpp diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp diff --git a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_dl_int8.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp diff --git 
a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp similarity index 100% rename from example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp rename to composable_kernel/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp diff --git a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc b/composable_kernel/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc similarity index 100% rename from example/09_convnd_fwd/run_convnd_fwd_dl_example.inc rename to composable_kernel/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/composable_kernel/example/09_convnd_fwd/run_convnd_fwd_example.inc similarity index 100% rename from example/09_convnd_fwd/run_convnd_fwd_example.inc rename to composable_kernel/example/09_convnd_fwd/run_convnd_fwd_example.inc diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt rename to 
composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp rename to composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp rename to composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp rename to composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp rename to composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp rename to 
composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp rename to composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc similarity index 100% rename from example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc rename to composable_kernel/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc diff --git a/example/12_reduce/CMakeLists.txt b/composable_kernel/example/12_reduce/CMakeLists.txt similarity index 100% rename from example/12_reduce/CMakeLists.txt rename to composable_kernel/example/12_reduce/CMakeLists.txt diff --git a/example/12_reduce/README.md b/composable_kernel/example/12_reduce/README.md similarity index 100% rename from example/12_reduce/README.md rename to composable_kernel/example/12_reduce/README.md diff --git a/example/12_reduce/reduce_blockwise.cpp b/composable_kernel/example/12_reduce/reduce_blockwise.cpp similarity index 100% rename from example/12_reduce/reduce_blockwise.cpp rename to composable_kernel/example/12_reduce/reduce_blockwise.cpp diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/composable_kernel/example/12_reduce/reduce_blockwise_impl.hpp similarity index 100% rename from example/12_reduce/reduce_blockwise_impl.hpp rename to composable_kernel/example/12_reduce/reduce_blockwise_impl.hpp diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/composable_kernel/example/12_reduce/reduce_blockwise_two_call.cpp 
similarity index 100% rename from example/12_reduce/reduce_blockwise_two_call.cpp rename to composable_kernel/example/12_reduce/reduce_blockwise_two_call.cpp diff --git a/example/12_reduce/reduce_example_common.hpp b/composable_kernel/example/12_reduce/reduce_example_common.hpp similarity index 100% rename from example/12_reduce/reduce_example_common.hpp rename to composable_kernel/example/12_reduce/reduce_example_common.hpp diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/composable_kernel/example/12_reduce/reduce_multiblock_atomic_add.cpp similarity index 100% rename from example/12_reduce/reduce_multiblock_atomic_add.cpp rename to composable_kernel/example/12_reduce/reduce_multiblock_atomic_add.cpp diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/composable_kernel/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp similarity index 100% rename from example/12_reduce/reduce_multiblock_atomic_add_impl.hpp rename to composable_kernel/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp diff --git a/example/13_pool2d_fwd/CMakeLists.txt b/composable_kernel/example/13_pool2d_fwd/CMakeLists.txt similarity index 100% rename from example/13_pool2d_fwd/CMakeLists.txt rename to composable_kernel/example/13_pool2d_fwd/CMakeLists.txt diff --git a/example/13_pool2d_fwd/README.md b/composable_kernel/example/13_pool2d_fwd/README.md similarity index 100% rename from example/13_pool2d_fwd/README.md rename to composable_kernel/example/13_pool2d_fwd/README.md diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/composable_kernel/example/13_pool2d_fwd/pool2d_fwd_common.hpp similarity index 100% rename from example/13_pool2d_fwd/pool2d_fwd_common.hpp rename to composable_kernel/example/13_pool2d_fwd/pool2d_fwd_common.hpp diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/composable_kernel/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp similarity index 100% rename from example/13_pool2d_fwd/pool2d_fwd_fp16.cpp rename to 
composable_kernel/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/composable_kernel/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp similarity index 100% rename from example/13_pool2d_fwd/pool2d_fwd_fp32.cpp rename to composable_kernel/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp diff --git a/example/14_gemm_quantization/CMakeLists.txt b/composable_kernel/example/14_gemm_quantization/CMakeLists.txt similarity index 100% rename from example/14_gemm_quantization/CMakeLists.txt rename to composable_kernel/example/14_gemm_quantization/CMakeLists.txt diff --git a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp b/composable_kernel/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp similarity index 100% rename from example/14_gemm_quantization/gemm_dl_quantization_int8.cpp rename to composable_kernel/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp diff --git a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp b/composable_kernel/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp similarity index 100% rename from example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp rename to composable_kernel/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp diff --git a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp b/composable_kernel/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp similarity index 100% rename from example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp rename to composable_kernel/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp diff --git a/example/15_grouped_gemm/CMakeLists.txt b/composable_kernel/example/15_grouped_gemm/CMakeLists.txt similarity index 100% rename from example/15_grouped_gemm/CMakeLists.txt rename to composable_kernel/example/15_grouped_gemm/CMakeLists.txt diff --git a/example/15_grouped_gemm/README.md b/composable_kernel/example/15_grouped_gemm/README.md similarity 
index 100% rename from example/15_grouped_gemm/README.md rename to composable_kernel/example/15_grouped_gemm/README.md diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp8.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp8.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp8.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp8.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp rename to 
composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp b/composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp rename to composable_kernel/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/composable_kernel/example/15_grouped_gemm/run_grouped_gemm_example.inc similarity index 100% rename from example/15_grouped_gemm/run_grouped_gemm_example.inc rename to composable_kernel/example/15_grouped_gemm/run_grouped_gemm_example.inc diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/composable_kernel/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/CMakeLists.txt rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp 
b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp rename to 
composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp b/composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp similarity index 100% rename from example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp rename to composable_kernel/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp diff --git a/example/17_convnd_bwd_data/CMakeLists.txt 
b/composable_kernel/example/17_convnd_bwd_data/CMakeLists.txt similarity index 100% rename from example/17_convnd_bwd_data/CMakeLists.txt rename to composable_kernel/example/17_convnd_bwd_data/CMakeLists.txt diff --git a/example/17_convnd_bwd_data/README.md b/composable_kernel/example/17_convnd_bwd_data/README.md similarity index 100% rename from example/17_convnd_bwd_data/README.md rename to composable_kernel/example/17_convnd_bwd_data/README.md diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/composable_kernel/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp similarity index 100% rename from example/17_convnd_bwd_data/convnd_bwd_data_common.hpp rename to composable_kernel/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp b/composable_kernel/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp similarity index 100% rename from example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp rename to composable_kernel/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/composable_kernel/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp similarity index 100% rename from example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp rename to composable_kernel/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp diff --git a/example/18_batched_gemm_reduce/CMakeLists.txt b/composable_kernel/example/18_batched_gemm_reduce/CMakeLists.txt similarity index 100% rename from example/18_batched_gemm_reduce/CMakeLists.txt rename to composable_kernel/example/18_batched_gemm_reduce/CMakeLists.txt diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/composable_kernel/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp similarity index 100% rename from example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp rename to 
composable_kernel/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp diff --git a/example/19_binary_elementwise/CMakeLists.txt b/composable_kernel/example/19_binary_elementwise/CMakeLists.txt similarity index 100% rename from example/19_binary_elementwise/CMakeLists.txt rename to composable_kernel/example/19_binary_elementwise/CMakeLists.txt diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/composable_kernel/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp similarity index 100% rename from example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp rename to composable_kernel/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/composable_kernel/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp similarity index 100% rename from example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp rename to composable_kernel/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/composable_kernel/example/19_binary_elementwise/elementwise_add_1d.cpp similarity index 100% rename from example/19_binary_elementwise/elementwise_add_1d.cpp rename to composable_kernel/example/19_binary_elementwise/elementwise_add_1d.cpp diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/composable_kernel/example/19_binary_elementwise/elementwise_add_4d.cpp similarity index 100% rename from example/19_binary_elementwise/elementwise_add_4d.cpp rename to composable_kernel/example/19_binary_elementwise/elementwise_add_4d.cpp diff --git a/example/20_grouped_conv_bwd_weight/CMakeLists.txt b/composable_kernel/example/20_grouped_conv_bwd_weight/CMakeLists.txt similarity index 100% rename from example/20_grouped_conv_bwd_weight/CMakeLists.txt rename to composable_kernel/example/20_grouped_conv_bwd_weight/CMakeLists.txt diff --git a/example/20_grouped_conv_bwd_weight/common.hpp 
b/composable_kernel/example/20_grouped_conv_bwd_weight/common.hpp similarity index 100% rename from example/20_grouped_conv_bwd_weight/common.hpp rename to composable_kernel/example/20_grouped_conv_bwd_weight/common.hpp diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp b/composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp similarity index 100% rename from example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp rename to composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_dl_fp16.cpp diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_wmma_fp16.cpp b/composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_wmma_fp16.cpp similarity index 100% rename from example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_wmma_fp16.cpp rename to composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_wmma_fp16.cpp diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp b/composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp similarity index 100% rename from example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp rename to composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp b/composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp similarity index 100% rename from example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp rename to composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp b/composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp similarity index 100% rename from 
example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp rename to composable_kernel/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/composable_kernel/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc similarity index 100% rename from example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc rename to composable_kernel/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc diff --git a/example/21_gemm_layernorm/CMakeLists.txt b/composable_kernel/example/21_gemm_layernorm/CMakeLists.txt similarity index 100% rename from example/21_gemm_layernorm/CMakeLists.txt rename to composable_kernel/example/21_gemm_layernorm/CMakeLists.txt diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/composable_kernel/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp similarity index 100% rename from example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp rename to composable_kernel/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp b/composable_kernel/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp similarity index 100% rename from example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp rename to composable_kernel/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/composable_kernel/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp similarity index 100% rename from example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp rename to composable_kernel/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp diff --git 
a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/composable_kernel/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp similarity index 100% rename from example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp rename to composable_kernel/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp diff --git a/example/22_cgemm/CMakeLists.txt b/composable_kernel/example/22_cgemm/CMakeLists.txt similarity index 100% rename from example/22_cgemm/CMakeLists.txt rename to composable_kernel/example/22_cgemm/CMakeLists.txt diff --git a/example/22_cgemm/cgemm_xdl_bf16.cpp b/composable_kernel/example/22_cgemm/cgemm_xdl_bf16.cpp similarity index 100% rename from example/22_cgemm/cgemm_xdl_bf16.cpp rename to composable_kernel/example/22_cgemm/cgemm_xdl_bf16.cpp diff --git a/example/22_cgemm/cgemm_xdl_common.hpp b/composable_kernel/example/22_cgemm/cgemm_xdl_common.hpp similarity index 100% rename from example/22_cgemm/cgemm_xdl_common.hpp rename to composable_kernel/example/22_cgemm/cgemm_xdl_common.hpp diff --git a/example/22_cgemm/cgemm_xdl_fp16.cpp b/composable_kernel/example/22_cgemm/cgemm_xdl_fp16.cpp similarity index 100% rename from example/22_cgemm/cgemm_xdl_fp16.cpp rename to composable_kernel/example/22_cgemm/cgemm_xdl_fp16.cpp diff --git a/example/22_cgemm/cgemm_xdl_fp32.cpp b/composable_kernel/example/22_cgemm/cgemm_xdl_fp32.cpp similarity index 100% rename from example/22_cgemm/cgemm_xdl_fp32.cpp rename to composable_kernel/example/22_cgemm/cgemm_xdl_fp32.cpp diff --git a/example/22_cgemm/cgemm_xdl_int4.cpp b/composable_kernel/example/22_cgemm/cgemm_xdl_int4.cpp similarity index 100% rename from example/22_cgemm/cgemm_xdl_int4.cpp rename to composable_kernel/example/22_cgemm/cgemm_xdl_int4.cpp diff --git a/example/22_cgemm/cgemm_xdl_int8.cpp b/composable_kernel/example/22_cgemm/cgemm_xdl_int8.cpp similarity index 100% rename from example/22_cgemm/cgemm_xdl_int8.cpp rename to 
composable_kernel/example/22_cgemm/cgemm_xdl_int8.cpp diff --git a/example/23_softmax/CMakeLists.txt b/composable_kernel/example/23_softmax/CMakeLists.txt similarity index 100% rename from example/23_softmax/CMakeLists.txt rename to composable_kernel/example/23_softmax/CMakeLists.txt diff --git a/example/23_softmax/README.md b/composable_kernel/example/23_softmax/README.md similarity index 100% rename from example/23_softmax/README.md rename to composable_kernel/example/23_softmax/README.md diff --git a/example/23_softmax/softmax_blockwise.cpp b/composable_kernel/example/23_softmax/softmax_blockwise.cpp similarity index 100% rename from example/23_softmax/softmax_blockwise.cpp rename to composable_kernel/example/23_softmax/softmax_blockwise.cpp diff --git a/example/24_batched_gemm/CMakeLists.txt b/composable_kernel/example/24_batched_gemm/CMakeLists.txt similarity index 100% rename from example/24_batched_gemm/CMakeLists.txt rename to composable_kernel/example/24_batched_gemm/CMakeLists.txt diff --git a/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp b/composable_kernel/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp similarity index 100% rename from example/24_batched_gemm/batched_gemm_xdl_bf16.cpp rename to composable_kernel/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp b/composable_kernel/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp similarity index 100% rename from example/24_batched_gemm/batched_gemm_xdl_fp16.cpp rename to composable_kernel/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp b/composable_kernel/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp similarity index 100% rename from example/24_batched_gemm/batched_gemm_xdl_fp32.cpp rename to composable_kernel/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp diff --git a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp 
b/composable_kernel/example/24_batched_gemm/batched_gemm_xdl_int4.cpp similarity index 100% rename from example/24_batched_gemm/batched_gemm_xdl_int4.cpp rename to composable_kernel/example/24_batched_gemm/batched_gemm_xdl_int4.cpp diff --git a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp b/composable_kernel/example/24_batched_gemm/batched_gemm_xdl_int8.cpp similarity index 100% rename from example/24_batched_gemm/batched_gemm_xdl_int8.cpp rename to composable_kernel/example/24_batched_gemm/batched_gemm_xdl_int8.cpp diff --git a/example/24_batched_gemm/run_batched_gemm_example.inc b/composable_kernel/example/24_batched_gemm/run_batched_gemm_example.inc similarity index 100% rename from example/24_batched_gemm/run_batched_gemm_example.inc rename to composable_kernel/example/24_batched_gemm/run_batched_gemm_example.inc diff --git a/example/25_gemm_bias_e_permute/CMakeLists.txt b/composable_kernel/example/25_gemm_bias_e_permute/CMakeLists.txt similarity index 100% rename from example/25_gemm_bias_e_permute/CMakeLists.txt rename to composable_kernel/example/25_gemm_bias_e_permute/CMakeLists.txt diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/composable_kernel/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp similarity index 100% rename from example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp rename to composable_kernel/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp b/composable_kernel/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp similarity index 100% rename from example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp rename to composable_kernel/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp diff --git a/example/26_contraction/CMakeLists.txt b/composable_kernel/example/26_contraction/CMakeLists.txt 
similarity index 100% rename from example/26_contraction/CMakeLists.txt rename to composable_kernel/example/26_contraction/CMakeLists.txt diff --git a/example/26_contraction/README.md b/composable_kernel/example/26_contraction/README.md similarity index 100% rename from example/26_contraction/README.md rename to composable_kernel/example/26_contraction/README.md diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/composable_kernel/example/26_contraction/contraction_bilinear_xdl_fp32.cpp similarity index 100% rename from example/26_contraction/contraction_bilinear_xdl_fp32.cpp rename to composable_kernel/example/26_contraction/contraction_bilinear_xdl_fp32.cpp diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp b/composable_kernel/example/26_contraction/contraction_bilinear_xdl_fp64.cpp similarity index 100% rename from example/26_contraction/contraction_bilinear_xdl_fp64.cpp rename to composable_kernel/example/26_contraction/contraction_bilinear_xdl_fp64.cpp diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/composable_kernel/example/26_contraction/contraction_scale_xdl_fp32.cpp similarity index 100% rename from example/26_contraction/contraction_scale_xdl_fp32.cpp rename to composable_kernel/example/26_contraction/contraction_scale_xdl_fp32.cpp diff --git a/example/26_contraction/contraction_scale_xdl_fp64.cpp b/composable_kernel/example/26_contraction/contraction_scale_xdl_fp64.cpp similarity index 100% rename from example/26_contraction/contraction_scale_xdl_fp64.cpp rename to composable_kernel/example/26_contraction/contraction_scale_xdl_fp64.cpp diff --git a/example/27_layernorm/CMakeLists.txt b/composable_kernel/example/27_layernorm/CMakeLists.txt similarity index 100% rename from example/27_layernorm/CMakeLists.txt rename to composable_kernel/example/27_layernorm/CMakeLists.txt diff --git a/example/27_layernorm/common.hpp b/composable_kernel/example/27_layernorm/common.hpp similarity index 100% rename 
from example/27_layernorm/common.hpp rename to composable_kernel/example/27_layernorm/common.hpp diff --git a/example/27_layernorm/layernorm_fp16.cpp b/composable_kernel/example/27_layernorm/layernorm_fp16.cpp similarity index 100% rename from example/27_layernorm/layernorm_fp16.cpp rename to composable_kernel/example/27_layernorm/layernorm_fp16.cpp diff --git a/example/27_layernorm/layernorm_splitk_fp16.cpp b/composable_kernel/example/27_layernorm/layernorm_splitk_fp16.cpp similarity index 100% rename from example/27_layernorm/layernorm_splitk_fp16.cpp rename to composable_kernel/example/27_layernorm/layernorm_splitk_fp16.cpp diff --git a/example/27_layernorm/run_layernorm_example.inc b/composable_kernel/example/27_layernorm/run_layernorm_example.inc similarity index 100% rename from example/27_layernorm/run_layernorm_example.inc rename to composable_kernel/example/27_layernorm/run_layernorm_example.inc diff --git a/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt b/composable_kernel/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt similarity index 100% rename from example/28_grouped_gemm_bias_e_permute/CMakeLists.txt rename to composable_kernel/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/composable_kernel/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp similarity index 100% rename from example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp rename to composable_kernel/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp diff --git a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt b/composable_kernel/example/29_batched_gemm_bias_e_permute/CMakeLists.txt similarity index 100% rename from example/29_batched_gemm_bias_e_permute/CMakeLists.txt rename to composable_kernel/example/29_batched_gemm_bias_e_permute/CMakeLists.txt diff --git 
a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/composable_kernel/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp similarity index 100% rename from example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp rename to composable_kernel/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/composable_kernel/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp similarity index 100% rename from example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp rename to composable_kernel/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt diff --git a/example/30_grouped_conv_fwd_multiple_d/README.md b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/README.md similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/README.md rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/README.md diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/common.hpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/common.hpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/common.hpp diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp rename to 
composable_kernel/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_int8.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp similarity 
index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc 
b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc b/composable_kernel/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc similarity index 100% rename from example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc rename to composable_kernel/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/composable_kernel/example/31_batched_gemm_gemm/CMakeLists.txt similarity index 100% rename from example/31_batched_gemm_gemm/CMakeLists.txt rename to composable_kernel/example/31_batched_gemm_gemm/CMakeLists.txt diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp similarity index 100% rename from example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp rename to composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp similarity index 100% rename from example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp rename to composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp similarity index 100% rename from example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp rename to 
composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp similarity index 100% rename from example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp rename to composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp similarity index 100% rename from example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp rename to composable_kernel/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/composable_kernel/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc similarity index 100% rename from example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc rename to composable_kernel/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp diff --git 
a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp 
b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/composable_kernel/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc 
similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc rename to composable_kernel/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc diff --git a/example/33_multiple_reduce/CMakeLists.txt b/composable_kernel/example/33_multiple_reduce/CMakeLists.txt similarity index 100% rename from example/33_multiple_reduce/CMakeLists.txt rename to composable_kernel/example/33_multiple_reduce/CMakeLists.txt diff --git a/example/33_multiple_reduce/README.md b/composable_kernel/example/33_multiple_reduce/README.md similarity index 100% rename from example/33_multiple_reduce/README.md rename to composable_kernel/example/33_multiple_reduce/README.md diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/composable_kernel/example/33_multiple_reduce/dual_reduce_common.hpp similarity index 100% rename from example/33_multiple_reduce/dual_reduce_common.hpp rename to composable_kernel/example/33_multiple_reduce/dual_reduce_common.hpp diff --git a/example/33_multiple_reduce/dual_reduce_multiblock.cpp b/composable_kernel/example/33_multiple_reduce/dual_reduce_multiblock.cpp similarity index 100% rename from example/33_multiple_reduce/dual_reduce_multiblock.cpp rename to composable_kernel/example/33_multiple_reduce/dual_reduce_multiblock.cpp diff --git a/example/33_multiple_reduce/dual_reduce_threadwise.cpp b/composable_kernel/example/33_multiple_reduce/dual_reduce_threadwise.cpp similarity index 100% rename from example/33_multiple_reduce/dual_reduce_threadwise.cpp rename to composable_kernel/example/33_multiple_reduce/dual_reduce_threadwise.cpp diff --git a/example/34_batchnorm/CMakeLists.txt b/composable_kernel/example/34_batchnorm/CMakeLists.txt similarity index 100% rename from example/34_batchnorm/CMakeLists.txt rename to composable_kernel/example/34_batchnorm/CMakeLists.txt diff --git a/example/34_batchnorm/README.md 
b/composable_kernel/example/34_batchnorm/README.md similarity index 100% rename from example/34_batchnorm/README.md rename to composable_kernel/example/34_batchnorm/README.md diff --git a/example/34_batchnorm/batchnorm_backward_nhwc.cpp b/composable_kernel/example/34_batchnorm/batchnorm_backward_nhwc.cpp similarity index 100% rename from example/34_batchnorm/batchnorm_backward_nhwc.cpp rename to composable_kernel/example/34_batchnorm/batchnorm_backward_nhwc.cpp diff --git a/example/34_batchnorm/batchnorm_common.hpp b/composable_kernel/example/34_batchnorm/batchnorm_common.hpp similarity index 100% rename from example/34_batchnorm/batchnorm_common.hpp rename to composable_kernel/example/34_batchnorm/batchnorm_common.hpp diff --git a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp b/composable_kernel/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp similarity index 100% rename from example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp rename to composable_kernel/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp b/composable_kernel/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp similarity index 100% rename from example/34_batchnorm/batchnorm_forward_training_nhwc.cpp rename to composable_kernel/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp b/composable_kernel/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp similarity index 100% rename from example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp rename to composable_kernel/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp diff --git a/example/34_batchnorm/batchnorm_infer_impl.hpp b/composable_kernel/example/34_batchnorm/batchnorm_infer_impl.hpp similarity index 100% rename from example/34_batchnorm/batchnorm_infer_impl.hpp rename to 
composable_kernel/example/34_batchnorm/batchnorm_infer_impl.hpp diff --git a/example/35_splitK_gemm/CMakeLists.txt b/composable_kernel/example/35_splitK_gemm/CMakeLists.txt similarity index 100% rename from example/35_splitK_gemm/CMakeLists.txt rename to composable_kernel/example/35_splitK_gemm/CMakeLists.txt diff --git a/example/35_splitK_gemm/run_splitK_gemm_example.inc b/composable_kernel/example/35_splitK_gemm/run_splitK_gemm_example.inc similarity index 100% rename from example/35_splitK_gemm/run_splitK_gemm_example.inc rename to composable_kernel/example/35_splitK_gemm/run_splitK_gemm_example.inc diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp b/composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp similarity index 100% rename from example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp rename to composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp b/composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp similarity index 100% rename from example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp rename to composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp b/composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp similarity index 100% rename from example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp rename to composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp b/composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp similarity index 100% rename from example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp rename to composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp b/composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp similarity index 100% rename from example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp rename to 
composable_kernel/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp diff --git a/example/36_sparse_embedding/CMakeLists.txt b/composable_kernel/example/36_sparse_embedding/CMakeLists.txt similarity index 100% rename from example/36_sparse_embedding/CMakeLists.txt rename to composable_kernel/example/36_sparse_embedding/CMakeLists.txt diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/composable_kernel/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp similarity index 100% rename from example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp rename to composable_kernel/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt b/composable_kernel/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt similarity index 100% rename from example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt rename to composable_kernel/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/composable_kernel/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp similarity index 100% rename from example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp rename to composable_kernel/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp diff --git a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt b/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt similarity index 100% rename from example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt rename to composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/common.hpp similarity 
index 100% rename from example/38_grouped_conv_bwd_data_multiple_d/common.hpp rename to composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/common.hpp diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp b/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp similarity index 100% rename from example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp rename to composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_xdl_fp16.cpp diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_wmma_fp16.cpp b/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_wmma_fp16.cpp similarity index 100% rename from example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_wmma_fp16.cpp rename to composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_wmma_fp16.cpp diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp b/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp similarity index 100% rename from example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp rename to composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_xdl_fp16.cpp diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc similarity index 100% rename from example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc rename to composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc 
b/composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc similarity index 100% rename from example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc rename to composable_kernel/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc diff --git a/example/39_permute/CMakeLists.txt b/composable_kernel/example/39_permute/CMakeLists.txt similarity index 100% rename from example/39_permute/CMakeLists.txt rename to composable_kernel/example/39_permute/CMakeLists.txt diff --git a/example/39_permute/common.hpp b/composable_kernel/example/39_permute/common.hpp similarity index 100% rename from example/39_permute/common.hpp rename to composable_kernel/example/39_permute/common.hpp diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/composable_kernel/example/39_permute/permute_1xHxW_fp16.cpp similarity index 100% rename from example/39_permute/permute_1xHxW_fp16.cpp rename to composable_kernel/example/39_permute/permute_1xHxW_fp16.cpp diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/composable_kernel/example/39_permute/permute_HxWx4_fp16.cpp similarity index 100% rename from example/39_permute/permute_HxWx4_fp16.cpp rename to composable_kernel/example/39_permute/permute_HxWx4_fp16.cpp diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/composable_kernel/example/39_permute/permute_NxHxW_fp16.cpp similarity index 100% rename from example/39_permute/permute_NxHxW_fp16.cpp rename to composable_kernel/example/39_permute/permute_NxHxW_fp16.cpp diff --git a/example/39_permute/run_permute_bundle_example.inc b/composable_kernel/example/39_permute/run_permute_bundle_example.inc similarity index 100% rename from example/39_permute/run_permute_bundle_example.inc rename to composable_kernel/example/39_permute/run_permute_bundle_example.inc diff --git a/example/39_permute/run_permute_element_example.inc b/composable_kernel/example/39_permute/run_permute_element_example.inc 
similarity index 100% rename from example/39_permute/run_permute_element_example.inc rename to composable_kernel/example/39_permute/run_permute_element_example.inc diff --git a/example/40_conv2d_fwd_quantization/CMakeLists.txt b/composable_kernel/example/40_conv2d_fwd_quantization/CMakeLists.txt similarity index 100% rename from example/40_conv2d_fwd_quantization/CMakeLists.txt rename to composable_kernel/example/40_conv2d_fwd_quantization/CMakeLists.txt diff --git a/example/40_conv2d_fwd_quantization/common.hpp b/composable_kernel/example/40_conv2d_fwd_quantization/common.hpp similarity index 100% rename from example/40_conv2d_fwd_quantization/common.hpp rename to composable_kernel/example/40_conv2d_fwd_quantization/common.hpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp 
rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp 
b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp similarity index 100% rename from example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp rename to composable_kernel/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc b/composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc similarity index 100% rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc rename to composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc b/composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc similarity index 100% rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc 
rename to composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc similarity index 100% rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc rename to composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc similarity index 100% rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc rename to composable_kernel/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc diff --git a/example/41_grouped_conv_conv_fwd/CMakeLists.txt b/composable_kernel/example/41_grouped_conv_conv_fwd/CMakeLists.txt similarity index 100% rename from example/41_grouped_conv_conv_fwd/CMakeLists.txt rename to composable_kernel/example/41_grouped_conv_conv_fwd/CMakeLists.txt diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp similarity index 100% rename from example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp rename to composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp similarity index 100% rename from example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp rename to 
composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp similarity index 100% rename from example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp rename to composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp b/composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp similarity index 100% rename from example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp rename to composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp b/composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp similarity index 100% rename from example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp rename to composable_kernel/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp diff --git a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc b/composable_kernel/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc similarity index 100% rename from example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc rename to composable_kernel/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc diff --git a/example/42_groupnorm/CMakeLists.txt b/composable_kernel/example/42_groupnorm/CMakeLists.txt similarity index 100% rename from example/42_groupnorm/CMakeLists.txt rename to composable_kernel/example/42_groupnorm/CMakeLists.txt diff --git a/example/42_groupnorm/common.hpp b/composable_kernel/example/42_groupnorm/common.hpp similarity index 100% rename from example/42_groupnorm/common.hpp rename to 
composable_kernel/example/42_groupnorm/common.hpp diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/composable_kernel/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp similarity index 100% rename from example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp rename to composable_kernel/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp diff --git a/example/42_groupnorm/groupnorm_splitk_fp16.cpp b/composable_kernel/example/42_groupnorm/groupnorm_splitk_fp16.cpp similarity index 100% rename from example/42_groupnorm/groupnorm_splitk_fp16.cpp rename to composable_kernel/example/42_groupnorm/groupnorm_splitk_fp16.cpp diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/composable_kernel/example/42_groupnorm/groupnorm_swish_fp16.cpp similarity index 100% rename from example/42_groupnorm/groupnorm_swish_fp16.cpp rename to composable_kernel/example/42_groupnorm/groupnorm_swish_fp16.cpp diff --git a/example/42_groupnorm/run_groupnorm_example.inc b/composable_kernel/example/42_groupnorm/run_groupnorm_example.inc similarity index 100% rename from example/42_groupnorm/run_groupnorm_example.inc rename to composable_kernel/example/42_groupnorm/run_groupnorm_example.inc diff --git a/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt b/composable_kernel/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt similarity index 100% rename from example/43_splitk_gemm_bias_e_permute/CMakeLists.txt rename to composable_kernel/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp b/composable_kernel/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp similarity index 100% rename from example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp rename to composable_kernel/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp diff --git 
a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp b/composable_kernel/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp similarity index 100% rename from example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp rename to composable_kernel/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp diff --git a/example/44_elementwise_permute/CMakeLists.txt b/composable_kernel/example/44_elementwise_permute/CMakeLists.txt similarity index 100% rename from example/44_elementwise_permute/CMakeLists.txt rename to composable_kernel/example/44_elementwise_permute/CMakeLists.txt diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/composable_kernel/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp similarity index 100% rename from example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp rename to composable_kernel/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp b/composable_kernel/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp similarity index 100% rename from example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp rename to composable_kernel/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp diff --git a/example/45_elementwise_normalization/CMakeLists.txt b/composable_kernel/example/45_elementwise_normalization/CMakeLists.txt similarity index 100% rename from example/45_elementwise_normalization/CMakeLists.txt rename to composable_kernel/example/45_elementwise_normalization/CMakeLists.txt diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/composable_kernel/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp similarity index 100% rename from example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp rename to 
composable_kernel/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp diff --git a/example/46_gemm_add_multiply/CMakeLists.txt b/composable_kernel/example/46_gemm_add_multiply/CMakeLists.txt similarity index 100% rename from example/46_gemm_add_multiply/CMakeLists.txt rename to composable_kernel/example/46_gemm_add_multiply/CMakeLists.txt diff --git a/example/46_gemm_add_multiply/README.md b/composable_kernel/example/46_gemm_add_multiply/README.md similarity index 100% rename from example/46_gemm_add_multiply/README.md rename to composable_kernel/example/46_gemm_add_multiply/README.md diff --git a/example/46_gemm_add_multiply/common.hpp b/composable_kernel/example/46_gemm_add_multiply/common.hpp similarity index 100% rename from example/46_gemm_add_multiply/common.hpp rename to composable_kernel/example/46_gemm_add_multiply/common.hpp diff --git a/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp b/composable_kernel/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp similarity index 100% rename from example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp rename to composable_kernel/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp diff --git a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp b/composable_kernel/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp similarity index 100% rename from example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp rename to composable_kernel/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp diff --git a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc b/composable_kernel/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc similarity index 100% rename from example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc rename to composable_kernel/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc diff --git a/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt 
b/composable_kernel/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt similarity index 100% rename from example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt rename to composable_kernel/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp b/composable_kernel/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp similarity index 100% rename from example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp rename to composable_kernel/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp diff --git a/example/48_pool3d_fwd/CMakeLists.txt b/composable_kernel/example/48_pool3d_fwd/CMakeLists.txt similarity index 100% rename from example/48_pool3d_fwd/CMakeLists.txt rename to composable_kernel/example/48_pool3d_fwd/CMakeLists.txt diff --git a/example/48_pool3d_fwd/pool3d_fwd_common.hpp b/composable_kernel/example/48_pool3d_fwd/pool3d_fwd_common.hpp similarity index 100% rename from example/48_pool3d_fwd/pool3d_fwd_common.hpp rename to composable_kernel/example/48_pool3d_fwd/pool3d_fwd_common.hpp diff --git a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp b/composable_kernel/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp similarity index 100% rename from example/48_pool3d_fwd/pool3d_fwd_fp16.cpp rename to composable_kernel/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp diff --git a/example/49_maxpool2d_bwd/CMakeLists.txt b/composable_kernel/example/49_maxpool2d_bwd/CMakeLists.txt similarity index 100% rename from example/49_maxpool2d_bwd/CMakeLists.txt rename to composable_kernel/example/49_maxpool2d_bwd/CMakeLists.txt diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp b/composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp similarity index 100% rename from example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp rename to composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp diff --git 
a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp b/composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp similarity index 100% rename from example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp rename to composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp b/composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp similarity index 100% rename from example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp rename to composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp b/composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp similarity index 100% rename from example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp rename to composable_kernel/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp diff --git a/example/50_put_element/CMakeLists.txt b/composable_kernel/example/50_put_element/CMakeLists.txt similarity index 100% rename from example/50_put_element/CMakeLists.txt rename to composable_kernel/example/50_put_element/CMakeLists.txt diff --git a/example/50_put_element/put_element_fp16.cpp b/composable_kernel/example/50_put_element/put_element_fp16.cpp similarity index 100% rename from example/50_put_element/put_element_fp16.cpp rename to composable_kernel/example/50_put_element/put_element_fp16.cpp diff --git a/example/51_avgpool3d_bwd/CMakeLists.txt b/composable_kernel/example/51_avgpool3d_bwd/CMakeLists.txt similarity index 100% rename from example/51_avgpool3d_bwd/CMakeLists.txt rename to composable_kernel/example/51_avgpool3d_bwd/CMakeLists.txt diff --git a/example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp b/composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp similarity index 100% rename from example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp rename to composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp diff --git a/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp 
b/composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp similarity index 100% rename from example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp rename to composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp diff --git a/example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp b/composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp similarity index 100% rename from example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp rename to composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp diff --git a/example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp b/composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp similarity index 100% rename from example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp rename to composable_kernel/example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp diff --git a/example/52_im2col_col2im/CMakeLists.txt b/composable_kernel/example/52_im2col_col2im/CMakeLists.txt similarity index 100% rename from example/52_im2col_col2im/CMakeLists.txt rename to composable_kernel/example/52_im2col_col2im/CMakeLists.txt diff --git a/example/52_im2col_col2im/column_to_image_f32.cpp b/composable_kernel/example/52_im2col_col2im/column_to_image_f32.cpp similarity index 100% rename from example/52_im2col_col2im/column_to_image_f32.cpp rename to composable_kernel/example/52_im2col_col2im/column_to_image_f32.cpp diff --git a/example/52_im2col_col2im/common.hpp b/composable_kernel/example/52_im2col_col2im/common.hpp similarity index 100% rename from example/52_im2col_col2im/common.hpp rename to composable_kernel/example/52_im2col_col2im/common.hpp diff --git a/example/52_im2col_col2im/image_to_column_f32.cpp b/composable_kernel/example/52_im2col_col2im/image_to_column_f32.cpp similarity index 100% rename from example/52_im2col_col2im/image_to_column_f32.cpp rename to composable_kernel/example/52_im2col_col2im/image_to_column_f32.cpp diff --git a/example/53_gemv_splitk/CMakeLists.txt b/composable_kernel/example/53_gemv_splitk/CMakeLists.txt 
similarity index 100% rename from example/53_gemv_splitk/CMakeLists.txt rename to composable_kernel/example/53_gemv_splitk/CMakeLists.txt diff --git a/example/53_gemv_splitk/README.md b/composable_kernel/example/53_gemv_splitk/README.md similarity index 100% rename from example/53_gemv_splitk/README.md rename to composable_kernel/example/53_gemv_splitk/README.md diff --git a/example/53_gemv_splitk/common.hpp b/composable_kernel/example/53_gemv_splitk/common.hpp similarity index 100% rename from example/53_gemv_splitk/common.hpp rename to composable_kernel/example/53_gemv_splitk/common.hpp diff --git a/example/53_gemv_splitk/gemv_splitk_fp16.cpp b/composable_kernel/example/53_gemv_splitk/gemv_splitk_fp16.cpp similarity index 95% rename from example/53_gemv_splitk/gemv_splitk_fp16.cpp rename to composable_kernel/example/53_gemv_splitk/gemv_splitk_fp16.cpp index 8d50ebe02..302cce984 100755 --- a/example/53_gemv_splitk/gemv_splitk_fp16.cpp +++ b/composable_kernel/example/53_gemv_splitk/gemv_splitk_fp16.cpp @@ -2,7 +2,7 @@ // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#include "common.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp" using ADataType = ck::half_t; using BDataType = ck::half_t; @@ -25,7 +25,7 @@ static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpeciali #define B 64 // block-size:64 // clang-format off -using DeviceGemvInstance = ck::tensor_operation::device::deviceGemvDl/* +using DeviceGemvInstance = ck::tensor_operation::device::deviceTsmmDl/* // ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer | ABlockTransfer| ABlockTransfer | BBlockTransfer| BThreadTransfer| BThreadTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| // ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess|SrcVectorTensorLengths| SrcVectorTensor|DstVectorTensorLengths| SrcAccess| SrcVectorDim| SrcScalarPerVector| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| // ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | KBatch_K0_M0_M1_K1| KBatch_K0_M0_M1_K1| ArrangeOrder| Order| KBatch_K0_M0_M1_K1 | ContiguousDimOrder| KBatch_K0_M0_M1_K1 | Order| | | Order| | | diff --git a/example/53_gemv_splitk/run_gemv_splitk_example.inc b/composable_kernel/example/53_gemv_splitk/run_gemv_splitk_example.inc similarity index 100% rename from example/53_gemv_splitk/run_gemv_splitk_example.inc rename to composable_kernel/example/53_gemv_splitk/run_gemv_splitk_example.inc diff --git a/composable_kernel/example/54_tall_and_skinny_gemm_splitk/CMakeLists.txt b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/CMakeLists.txt new file mode 100755 index 000000000..f0ab9d991 --- 
/dev/null +++ b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/CMakeLists.txt @@ -0,0 +1,12 @@ +list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list AND target EQUAL 0) + add_custom_target(example_tall_and_skinny_gemm_splitk) + add_example_executable(example_tall_and_skinny_gemm_splitk_fp16 tall_and_skinny_gemm_splitk_fp16.cpp) + # set_source_files_properties(splitK_gemv_fp16.cpp PROPERTIES COMPILE_OPTIONS "--save-temps;-Wno-gnu-line-marker;-gline-tables-only") + add_dependencies(example_tall_and_skinny_gemm_splitk + example_tall_and_skinny_gemm_splitk_fp16) + set(target 1) + endif() + endforeach() \ No newline at end of file diff --git a/composable_kernel/example/54_tall_and_skinny_gemm_splitk/README.md b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/README.md new file mode 100755 index 000000000..9515e6971 --- /dev/null +++ b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/README.md @@ -0,0 +1,19 @@ +# Instructions for ```example_tall_and_skinny_gemm_splitk``` + +## Run ```example_tall_and_skinny_gemm_splitk``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: time kernel (0=no, nonzero=yes) +#arg4: number of splitk batches +./bin/example_tall_and_skinny_gemm_splitk_fp* 0 1 5 151 + +``` + +Result (MI250 @ 800MHz, 181.05 TFlops peak FP16) +``` +a_m_k: dim 2, lengths {16, 1024}, strides {1024, 1} +b_k_n: dim 2, lengths {1024, 16}, strides {16, 1} +c_m_n: dim 2, lengths {16, 16}, strides {16, 1} +Perf: 0.0684798 ms, 0.0076561 TFlops, 0.964489 GB/s, deviceGemvDl<64, 16, 128, 4, 2, 16, 2, 1> +``` diff --git a/composable_kernel/example/54_tall_and_skinny_gemm_splitk/common.hpp b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/common.hpp new file mode 100755 index 000000000..9dd3c7761 --- /dev/null +++ b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/common.hpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +struct ProblemSize final // Default GEMV problem size +{ + ck::index_t M = 16; + ck::index_t N = 16; + ck::index_t K = 1024; + // ck::index_t M = 2; + // ck::index_t N = 256; + // ck::index_t K = 256; + ck::index_t stride_A = K; + ck::index_t stride_B = N;//K; + ck::index_t stride_C = N; + ck::index_t k_batch = 1; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +inline bool +parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config) +{ + if(argc == 1) + { + // use default case + } + else if(argc == 5) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + problem_size.k_batch = std::stoi(argv[4]); + } + else if(argc == 11) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + problem_size.k_batch = std::stoi(argv[4]); + + problem_size.M = 
std::stoi(argv[5]); + problem_size.N = std::stoi(argv[6]); + problem_size.K = std::stoi(argv[7]); + + problem_size.stride_A = std::stoi(argv[8]); + problem_size.stride_B = std::stoi(argv[9]); + problem_size.stride_C = std::stoi(argv[10]); + } + else + { + std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" + << std::endl + << "arg3: time kernel (0=no, 1=yes)" << std::endl + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl; + return false; + } + + return true; +} diff --git a/composable_kernel/example/54_tall_and_skinny_gemm_splitk/run_tall_and_skinny_gemm_splitk_example.inc b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/run_tall_and_skinny_gemm_splitk_example.inc new file mode 100755 index 000000000..3c319dc4d --- /dev/null +++ b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/run_tall_and_skinny_gemm_splitk_example.inc @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +bool run_tall_and_skinny_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) +{ +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); +#endif + + using namespace ck::literals; + + auto& [M, N, K, StrideA, StrideB, StrideC, k_batch] = problem_size; // // + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + + switch(config.init_method) + { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + default: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + +#ifdef BUILD_INT4_EXAMPLE + DeviceMem a_m_k_device_buf(sizeof(KernelADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(KernelBDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(KernelCDataType) * + c_m_n_device_result.mDesc.GetElementSpaceSize()); + + const Tensor a_m_k_converted(a_m_k); + const Tensor b_k_n_converted(b_k_n); + + a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data()); +#else 
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); +#endif + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto tsmm = DeviceTSMMInstance{}; + auto invoker = tsmm.MakeInvoker(); + auto argument = tsmm.MakeArgument( +#ifdef BUILD_INT4_EXAMPLE + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), +#else + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), +#endif + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + k_batch); // // + + // // + if(!tsmm.IsSupportedArgument(argument)) + { + std::cerr << tsmm.GetTypeString() << " does not support this problem" << std::endl; + + return true; + } + + c_m_n_device_buf.SetZero(); + + + + if(config.do_verification) + { + invoker.Run(argument, StreamConfig{nullptr, false}); // Run prior to verification + auto ref_tsmm = ReferenceGemmInstance{}; + auto ref_invoker = ref_tsmm.MakeInvoker(); + + auto ref_argument = ref_tsmm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + +#ifdef BUILD_INT4_EXAMPLE + Tensor c_m_n_device_result_converted(c_m_n_host_result.mDesc); + + c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.mData.data()); + + c_m_n_device_result = c_m_n_device_result_converted.CopyAsType(); + +#else + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + +#endif + } + + float ave_time = 
invoker.Run( + argument, StreamConfig{nullptr, config.time_kernel}); // Run to measure performance + + std::size_t flop = 2_uz * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << tsmm.GetTypeString() << std::endl; + +#ifdef BUILD_INT4_EXAMPLE + return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result); +#else + return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); +#endif +} + +bool run_tall_and_skinny_gemm_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + if(argc == 1) + { + // use default case + } + else if(argc == 5) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + problem_size.k_batch = std::stoi(argv[4]); + } + else if(argc == 11) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + problem_size.k_batch = std::stoi(argv[4]); + + problem_size.M = std::stoi(argv[5]); + problem_size.N = std::stoi(argv[6]); + problem_size.K = std::stoi(argv[7]); + + problem_size.stride_A = std::stoi(argv[8]); + problem_size.stride_B = std::stoi(argv[9]); + problem_size.stride_C = std::stoi(argv[10]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4: splitk\n"); + printf("arg5 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); + } + + return run_tall_and_skinny_gemm(problem_size, config); +} diff --git 
a/composable_kernel/example/54_tall_and_skinny_gemm_splitk/tall_and_skinny_gemm_splitk_fp16.cpp b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/tall_and_skinny_gemm_splitk_fp16.cpp new file mode 100755 index 000000000..5d24260e9 --- /dev/null +++ b/composable_kernel/example/54_tall_and_skinny_gemm_splitk/tall_and_skinny_gemm_splitk_fp16.cpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp" + +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; + +using ALayout = Row; +using BLayout = Row;//Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; + +#define K1 2 +#define K0 4 +#define N1 2 +#define B 64 // block-size:64 +#define M1 16 + +// clang-format off +using DeviceTSMMInstance = ck::tensor_operation::device::deviceTsmmDl/* +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer | ABlockTransfer| ABlockTransfer | BBlockTransfer| BThreadTransfer| BThreadTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess|SrcVectorTensorLengths| SrcVectorTensor|DstVectorTensorLengths| SrcAccess| SrcVectorDim| SrcScalarPerVector| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | 
KBatch_K0_M0_M1_K1| KBatch_K0_M0_M1_K1| ArrangeOrder| Order| KBatch_K0_M0_M1_K1 | ContiguousDimOrder| KBatch_K0_M0_M1_K1 | Order| | | Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + //< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, 64, 1, 64, 32, 2, 1, 1, 1, S<1, 1, 1, 2>, S<32, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 2, 0, 3>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 1>;*/ + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, B, M1, B*N1, K0, K1, M1, N1, 1, S<1,1, 1, 1, K1>, S<1,K0, 1,M1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, 3, N1, S<0, 1, 2, 3, 4, 5>, 5, N1>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_tall_and_skinny_gemm_splitk_example.inc" + +int main(int argc, char* argv[]) { return !run_tall_and_skinny_gemm_example(argc, argv); } diff --git a/example/60_gemm_multi_ABD/CMakeLists.txt b/composable_kernel/example/60_gemm_multi_ABD/CMakeLists.txt similarity index 100% rename from example/60_gemm_multi_ABD/CMakeLists.txt rename to composable_kernel/example/60_gemm_multi_ABD/CMakeLists.txt diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp b/composable_kernel/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp similarity index 100% rename from example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp rename to composable_kernel/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp diff --git a/example/61_contraction_multi_ABD/CMakeLists.txt b/composable_kernel/example/61_contraction_multi_ABD/CMakeLists.txt similarity index 100% rename from example/61_contraction_multi_ABD/CMakeLists.txt rename to composable_kernel/example/61_contraction_multi_ABD/CMakeLists.txt diff --git 
a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp b/composable_kernel/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp similarity index 100% rename from example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp rename to composable_kernel/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp diff --git a/example/62_conv_fwd_activ/CMakeLists.txt b/composable_kernel/example/62_conv_fwd_activ/CMakeLists.txt similarity index 100% rename from example/62_conv_fwd_activ/CMakeLists.txt rename to composable_kernel/example/62_conv_fwd_activ/CMakeLists.txt diff --git a/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_abs_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_abs_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_abs_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_abs_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_clippedrelu_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_clippedrelu_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_clippedrelu_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_clippedrelu_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_elu_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_elu_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_elu_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_elu_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_leakyrelu_fp16.cpp 
b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_leakyrelu_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_leakyrelu_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_leakyrelu_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_pow_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_pow_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_pow_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_pow_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp b/composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp similarity index 100% rename from example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp rename to composable_kernel/example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp diff --git a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc 
b/composable_kernel/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc similarity index 100% rename from example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc rename to composable_kernel/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc diff --git a/example/CMakeLists.txt b/composable_kernel/example/CMakeLists.txt similarity index 100% rename from example/CMakeLists.txt rename to composable_kernel/example/CMakeLists.txt diff --git a/include/ck/ck.hpp b/composable_kernel/include/ck/ck.hpp similarity index 100% rename from include/ck/ck.hpp rename to composable_kernel/include/ck/ck.hpp diff --git a/include/ck/config.h.in b/composable_kernel/include/ck/config.h.in similarity index 100% rename from include/ck/config.h.in rename to composable_kernel/include/ck/config.h.in diff --git a/include/ck/host_utility/device_prop.hpp b/composable_kernel/include/ck/host_utility/device_prop.hpp similarity index 100% rename from include/ck/host_utility/device_prop.hpp rename to composable_kernel/include/ck/host_utility/device_prop.hpp diff --git a/include/ck/host_utility/hip_check_error.hpp b/composable_kernel/include/ck/host_utility/hip_check_error.hpp similarity index 100% rename from include/ck/host_utility/hip_check_error.hpp rename to composable_kernel/include/ck/host_utility/hip_check_error.hpp diff --git a/include/ck/host_utility/io.hpp b/composable_kernel/include/ck/host_utility/io.hpp similarity index 100% rename from include/ck/host_utility/io.hpp rename to composable_kernel/include/ck/host_utility/io.hpp diff --git a/include/ck/host_utility/kernel_launch.hpp b/composable_kernel/include/ck/host_utility/kernel_launch.hpp similarity index 100% rename from include/ck/host_utility/kernel_launch.hpp rename to composable_kernel/include/ck/host_utility/kernel_launch.hpp diff --git a/include/ck/host_utility/stream_utility.hpp b/composable_kernel/include/ck/host_utility/stream_utility.hpp similarity index 100% rename from 
include/ck/host_utility/stream_utility.hpp rename to composable_kernel/include/ck/host_utility/stream_utility.hpp diff --git a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp b/composable_kernel/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp similarity index 100% rename from include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp rename to composable_kernel/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp diff --git a/include/ck/stream_config.hpp b/composable_kernel/include/ck/stream_config.hpp similarity index 100% rename from include/ck/stream_config.hpp rename to composable_kernel/include/ck/stream_config.hpp diff --git a/include/ck/tensor/static_tensor.hpp b/composable_kernel/include/ck/tensor/static_tensor.hpp similarity index 100% rename from include/ck/tensor/static_tensor.hpp rename to composable_kernel/include/ck/tensor/static_tensor.hpp diff --git a/include/ck/tensor_description/cluster_descriptor.hpp b/composable_kernel/include/ck/tensor_description/cluster_descriptor.hpp similarity index 100% rename from include/ck/tensor_description/cluster_descriptor.hpp rename to composable_kernel/include/ck/tensor_description/cluster_descriptor.hpp diff --git a/include/ck/tensor_description/multi_index_transform.hpp b/composable_kernel/include/ck/tensor_description/multi_index_transform.hpp similarity index 100% rename from include/ck/tensor_description/multi_index_transform.hpp rename to composable_kernel/include/ck/tensor_description/multi_index_transform.hpp diff --git a/include/ck/tensor_description/multi_index_transform_helper.hpp b/composable_kernel/include/ck/tensor_description/multi_index_transform_helper.hpp similarity index 100% rename from include/ck/tensor_description/multi_index_transform_helper.hpp rename to 
composable_kernel/include/ck/tensor_description/multi_index_transform_helper.hpp diff --git a/include/ck/tensor_description/tensor_adaptor.hpp b/composable_kernel/include/ck/tensor_description/tensor_adaptor.hpp similarity index 100% rename from include/ck/tensor_description/tensor_adaptor.hpp rename to composable_kernel/include/ck/tensor_description/tensor_adaptor.hpp diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/ck/tensor_description/tensor_descriptor.hpp similarity index 100% rename from include/ck/tensor_description/tensor_descriptor.hpp rename to composable_kernel/include/ck/tensor_description/tensor_descriptor.hpp diff --git a/include/ck/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/ck/tensor_description/tensor_descriptor_helper.hpp similarity index 100% rename from include/ck/tensor_description/tensor_descriptor_helper.hpp rename to composable_kernel/include/ck/tensor_description/tensor_descriptor_helper.hpp diff --git a/include/ck/tensor_description/tensor_space_filling_curve.hpp b/composable_kernel/include/ck/tensor_description/tensor_space_filling_curve.hpp similarity index 100% rename from include/ck/tensor_description/tensor_space_filling_curve.hpp rename to composable_kernel/include/ck/tensor_description/tensor_space_filling_curve.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/blockwise_welford.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp diff --git a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp b/composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp diff --git a/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp diff --git a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp similarity index 100% rename from 
include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_base.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_base.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_base.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp diff 
--git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_cgemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_cgemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_cgemm.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp similarity index 100% rename from 
include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_elementwise.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_elementwise.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_elementwise.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp 
similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp similarity index 100% 
rename from include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_normalization.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_normalization.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_normalization.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_permute.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_permute.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_permute.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_permute.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_put_element.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_put_element.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_put_element.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_put_element.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_reduce.hpp similarity index 100% rename from 
include/ck/tensor_operation/gpu/device/device_reduce.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_reduce.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_softmax.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_softmax.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_softmax.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_gemv.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp similarity index 97% rename from include/ck/tensor_operation/gpu/device/device_gemv.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp index d37cf3c19..4b4664100 100755 --- a/include/ck/tensor_operation/gpu/device/device_gemv.hpp +++ b/composable_kernel/include/ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp @@ -18,7 +18,7 @@ template -struct DeviceGemv : public BaseOperator +struct DeviceTsmm : public BaseOperator { virtual std::unique_ptr MakeArgumentPointer(const void* p_a, const void* p_b, diff --git a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/gemm_specialization.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp similarity index 100% rename from 
include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp 
rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_layernorm_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp 
rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp similarity index 100% rename from 
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp similarity index 100% rename from 
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp similarity index 89% rename from include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp index 61813c453..d445013cc 100755 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp +++ b/composable_kernel/include/ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp @@ -10,9 +10,9 @@ #include "ck/tensor_description/tensor_descriptor.hpp" #include 
"ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemv.hpp" +#include "ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_gemv_splitk.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_tall_and_skinny_gemm_splitk.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -58,7 +58,7 @@ template < is_same_v && is_same_v, bool> = false> -struct deviceGemvDl : public DeviceGemv{}; static constexpr auto I5 = Number<5>{}; - // GridwiseGemv - using GridwiseGemv = - GridwiseGemvDl_km_kn_mn; - using DefaultBlock2CTileMap = typename GridwiseGemv::DefaultBlock2CTileMap; - using Argument = typename GridwiseGemv::Argument; + using DefaultBlock2CTileMap = typename GridwiseTsmm::DefaultBlock2CTileMap; + using Argument = typename GridwiseTsmm::Argument; // Invoker struct Invoker : public BaseInvoker { @@ -116,14 +116,14 @@ struct deviceGemvDl : public DeviceGemv(static_cast(p_a), static_cast(p_b), static_cast(p_c), @@ -334,10 +335,10 @@ struct deviceGemvDl : public DeviceGemv +__global__ void +#if CK_USE_LAUNCH_BOUNDS +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_tsmm_dl_v1r3( + typename GridwiseTsmm::Argument karg, + const Block2CTileMap& block_2_ctile_map) //: in __global__ functions, struct is + // better for reduced load overhead +{ + constexpr index_t shared_block_size = + GridwiseTsmm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseTsmm::template Run( + karg, + p_shared_block, + block_2_ctile_map, + integral_constant{}, + integral_constant{}); +} + +template +struct GridwiseTsmmDl_km_kn_mn +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr 
auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + // Argument + struct Argument : public tensor_operation::device::BaseArgument // + { + Argument(const FloatAB* p_a_grid_, + const FloatAB* p_b_grid_, + FloatC* p_c_grid_, + index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + index_t StrideC_, + index_t MPadded_, + index_t NPadded_, + index_t KPadded_, + index_t K0_, + index_t k_batch_) + : p_a_grid{p_a_grid_}, + p_b_grid{p_b_grid_}, + p_c_grid{p_c_grid_}, + M{M_}, + N{N_}, + K{K_}, + StrideA{StrideA_}, + StrideB{StrideB_}, + StrideC{StrideC_}, + MPadded(MPadded_), + NPadded(NPadded_), + KPadded(KPadded_), + K0(K0_), + k_batch(k_batch_) + { + } + + // private: + const FloatAB* p_a_grid; + const FloatAB* p_b_grid; + FloatC* p_c_grid; + + index_t M, N, K; + index_t StrideA, StrideB, StrideC; + //: + index_t MPadded; + index_t NPadded; + index_t KPadded; + index_t K0; + index_t k_batch; + }; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_block_desc_k_m.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr index_t + CalculateGridSize(index_t M, index_t N, index_t k_batch) // + { + const index_t grid_size = math::integer_divide_ceil(N, NPerBlock) * + math::integer_divide_ceil(M, MPerBlock) * k_batch; + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K0) + { + const bool has_main_k_block_loop = (K0 + K0PerBlock) / (2 * K0PerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K0) + + { + const bool has_double_tail_k_block_loop = (K0 / K0PerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static auto CalculateMPadded(index_t M) + { + return math::integer_least_multiple(M, MPerBlock); + } + + __host__ __device__ static auto CalculateNPadded(index_t N) + { + return math::integer_least_multiple(N, NPerBlock); + } + + __host__ __device__ static auto CalculateK0(index_t K, index_t K_Batch = 1) + { + // k_batch * k0 * k0_per_block * k1 + auto K_t = K_Batch * K0PerBlock * K1; + return (K + K_t - 1) / K_t * K0PerBlock; + } + + __host__ __device__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1) + { + auto K0 = CalculateK0(K, K_Batch); + return K_Batch * K0 * K1; + } + + static constexpr auto K1Number = Number{}; + + // M, K -> KBatch, K0, M, K1: M -> MPad, K->KBatch, K0, K1 + __host__ 
__device__ static auto MakeAGridDescriptor_KBatch_K0_M_K1( + index_t M, index_t MPad, index_t K, index_t StrideA, index_t KBatch, index_t K0) + { + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding) + { + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform( + make_tuple(KBatch, K0, K1Number)), // unmerge is split 1D to 3D + make_right_pad_transform(M, MPad - M)), // + make_tuple(Sequence<1>{}, Sequence<0>{}), // mapped to input M & K; sequence 0 is M; + // 1 is K; make unmerge is working on K; + make_tuple(Sequence<0, 1, 3>{}, // input is M,K; output we want is Kbatch, K0 and K1 + // -> 0, 1, 3; output is transformed from 2D to 4D + Sequence<2>{})); // 2->M + } + else + { + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + } + + __host__ __device__ static auto MakeBGridDescriptor_KBatch_K0_N_K1( + index_t K, index_t NPad, index_t N, index_t StrideB, index_t KBatch, index_t K0) + { + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding) + { + + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)), + 
make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + else + { + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + } + + __host__ __device__ static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC) + { + const auto c_grid_desc_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + } + + __host__ __device__ static auto GetKPad(index_t K, index_t KBatch) + { + const index_t K0 = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock; + const index_t KPad = KBatch * K0 * K1; + return KPad; + } + + using AGridDesc_Kbatch_K0_M_K1 = decltype(MakeAGridDescriptor_KBatch_K0_M_K1(1, 1, 1, 1, 1, 1)); + using BGridDesc_Kbatch_K0_N_K1 = decltype(MakeBGridDescriptor_KBatch_K0_N_K1(1, 1, 1, 1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + 
+ __host__ __device__ static constexpr bool CheckValidity(const Argument& karg) + { + const auto a_grid_desc_kbatch_k0_m_k1 = MakeAGridDescriptor_KBatch_K0_M_K1( + karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0); + const auto b_grid_desc_kbatch_k0_n_k1 = MakeBGridDescriptor_KBatch_K0_N_K1( + karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0); + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC); + + const auto KBatch_a = a_grid_desc_kbatch_k0_m_k1.GetLength(I0); + const auto KBatch_b = b_grid_desc_kbatch_k0_n_k1.GetLength(I0); + const auto K0_ = a_grid_desc_kbatch_k0_m_k1.GetLength(I1); + const auto M_ = a_grid_desc_kbatch_k0_m_k1.GetLength(I2); + const auto N_ = b_grid_desc_kbatch_k0_n_k1.GetLength(I2); + + return (M_ % MPerBlock == 0 && N_ % NPerBlock == 0 && K0_ % K0PerBlock == 0 && + M_ == c_grid_desc_m_n.GetLength(I0) && N_ == c_grid_desc_m_n.GetLength(I1) && + a_grid_desc_kbatch_k0_m_k1.GetLength(I3) == + b_grid_desc_kbatch_k0_n_k1.GetLength(I3) && + karg.k_batch >= 1 && KBatch_a == karg.k_batch && KBatch_b == karg.k_batch); + } + + // KBatch, K0, M, K1 -> KBatch, K0, M0, M1 (MPerBlock), K1 + __host__ __device__ static constexpr auto MakeAGridDescriptor_Kbatch_K0_M0_M1_K1( + const AGridDesc_Kbatch_K0_M_K1& a_grid_desc_kbatch_k0_m_k1) + { + const auto KBatch = a_grid_desc_kbatch_k0_m_k1.GetLength(I0); + const auto K0 = a_grid_desc_kbatch_k0_m_k1.GetLength(I1); + const auto M = a_grid_desc_kbatch_k0_m_k1.GetLength(I2); + + const auto M1 = Number{}; + const auto M0 = M / M1; + + const auto a_grid_desc_kbatch_k0_m0_m1_k1 = transform_tensor_descriptor( + a_grid_desc_kbatch_k0_m_k1, + make_tuple(make_pass_through_transform(KBatch), + make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(M0, M1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), // IP + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, 
Sequence<4>{})); // OP + + return a_grid_desc_kbatch_k0_m0_m1_k1; + } + + __host__ __device__ static constexpr auto MakeBGridDescriptor_Kbatch_K0_N0_N1_K1( + const BGridDesc_Kbatch_K0_N_K1& b_grid_desc_kbatch_k0_n_k1) + { + const auto KBatch = b_grid_desc_kbatch_k0_n_k1.GetLength(I0); + const auto K0 = b_grid_desc_kbatch_k0_n_k1.GetLength(I1); + const auto N = b_grid_desc_kbatch_k0_n_k1.GetLength(I2); + + const auto N1 = Number{}; + const auto N0 = N / N1; + + const auto b_grid_desc_kbatch_k0_n0_n1_k1 = transform_tensor_descriptor( + b_grid_desc_kbatch_k0_n_k1, + make_tuple(make_pass_through_transform(KBatch), + make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(N0, N1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + return b_grid_desc_kbatch_k0_n0_n1_k1; + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + constexpr auto M11 = Number{}; + constexpr auto N11 = Number{}; + + constexpr auto M10 = M1 / M11; + constexpr auto N10 = N1 / N11; + + const auto c_grid_desc_m0_m10_m11_n0_n10_n11 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), + make_unmerge_transform(make_tuple(N0, N10, N11))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_grid_desc_m0_m10_m11_n0_n10_n11; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap() + { + //: 3d ksplit for C + return BlockToCTileMap_3DGrid_KSplit(); + } + using 
DefaultBlock2CTileMap = remove_cvref_t; // + using AGridDesc_K0_M0_M1_K1 = + decltype(MakeAGridDescriptor_Kbatch_K0_M0_M1_K1(AGridDesc_Kbatch_K0_M_K1{})); + using BGridDesc_K0_N0_N1_K1 = + decltype(MakeBGridDescriptor_Kbatch_K0_N0_N1_K1(BGridDesc_Kbatch_K0_N_K1{})); + using CGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{})); // + using Block2CTileMap = decltype(MakeDefaultBlock2CTileMap()); // + + template + __device__ static void Run(const Argument& karg, + FloatAB* __restrict__ p_shared_block, + const Block2CTileMap& block_2_ctile_map, + integral_constant, + integral_constant) + { + const FloatAB* p_a_grid = karg.p_a_grid; + const FloatAB* p_b_grid = karg.p_b_grid; + FloatC* p_c_grid = karg.p_c_grid; + const auto a_grid_desc_kbatch_k0_m_k1 = GridwiseTsmm::MakeAGridDescriptor_KBatch_K0_M_K1( + karg.M, karg.MPadded, karg.K, karg.StrideA, karg.k_batch, karg.K0); // + const auto b_grid_desc_kbatch_k0_n_k1 = GridwiseTsmm::MakeBGridDescriptor_KBatch_K0_N_K1( + karg.K, karg.NPadded, karg.N, karg.StrideB, karg.k_batch, karg.K0); // + const auto c_grid_desc_m_n = + GridwiseTsmm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC); + + const auto a_grid_desc_kbatch_k0_m0_m1_k1 = + GridwiseTsmm::MakeAGridDescriptor_Kbatch_K0_M0_M1_K1(a_grid_desc_kbatch_k0_m_k1); // + const auto b_grid_desc_kbatch_k0_n0_n1_k1 = + GridwiseTsmm::MakeBGridDescriptor_Kbatch_K0_N0_N1_K1(b_grid_desc_kbatch_k0_n_k1); // + const auto c_grid_desc_m0_m10_m11_n0_n10_n11 = + GridwiseTsmm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n); + + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_kbatch_k0_m0_m1_k1.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_kbatch_k0_n0_n1_k1.GetElementSpaceSize()); + ignore = b_global_buf; + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_m0_m10_m11_n0_n10_n11.GetElementSpaceSize()); + + const auto 
c_m0_n0_block_cluster_idx = block_2_ctile_map.convert_1D_block_idx_to_3D_tuple( + get_block_1d_id(), karg.N, karg.k_batch); + + // HACK: this force index data into SGPR + const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]); + const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]); + const index_t kbatch_id = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I2]); + + if(!block_2_ctile_map.ValidCTileIndex( + make_tuple(im0, in0), + make_tuple(c_grid_desc_m0_m10_m11_n0_n10_n11.GetLength(I0), + c_grid_desc_m0_m10_m11_n0_n10_n11.GetLength(I3)))) + { + return; + } + + // TODO: change this. I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + constexpr auto a_block_desc_copy_kbatch_k0_m0_m1_k1 = make_naive_tensor_descriptor_aligned( + make_tuple(I1, Number{}, I1, Number{}, K1), max_lds_align); + + // A matrix blockwise copy + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< + BlockSize, + InMemoryDataOperationEnum::Set, + Sequence<1, K0PerBlock, 1, MPerBlock, K1.value>, //: 5 dimensions; kbatch for each + // dimension is 1 + ABlockTransferThreadSliceLengths_KBatch_K0_M0_M1_K1, + ABlockTransferThreadClusterLengths_KBatch_K0_M0_M1_K1, + ABlockTransferThreadClusterArrangeOrder, // 0, 1, 2, 3, 4 + FloatAB, + FloatAB, + remove_reference_t, // Global tensor desc + decltype(a_block_desc_copy_kbatch_k0_m0_m1_k1), // block tensor desc + ABlockTransferSrcAccessOrder, // 5-dim + Sequence<0, 1, 2, 3, 4>, + ABlockTransferSrcVectorTensorLengths_KBatch_K0_M0_M1_K1, // SrcVectorTensorLengths + ABlockTransferDstVectorTensorLengths_KBatch_K0_M0_M1_K1, // DstVectorTensorLengths + ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3, 4>, // DstVectorTensorContiguousDimOrder + false, + true>(a_grid_desc_kbatch_k0_m0_m1_k1, // for src desc + make_multi_index(kbatch_id, 0, im0, 0, 0), //: calculate start index of K + 
a_block_desc_copy_kbatch_k0_m0_m1_k1, // for dst desc + make_multi_index(0, 0, 0, 0, 0)); + + static constexpr auto b_thread_desc_copy_kbatch_k0_n0_n1_k1 = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{}, + Number{})); //: this descriptor is used only for copy + + static constexpr auto b_thread_desc_copy_k0_n0_n1_k1 = make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, I1, Number{}, Number{})); + + auto b_threadwise_copy = ThreadwiseTensorSliceTransfer_v2< + FloatAB, + FloatAB, + remove_reference_t, + decltype(b_thread_desc_copy_kbatch_k0_n0_n1_k1), // + Sequence<1, K0PerBlock, 1, NPerThread, K1.value>, + BThreadTransferSrcDstAccessOrder, + BThreadTransferSrcVectorDim, + BThreadTransferSrcScalarPerVector, + 1, + false, + true>(b_grid_desc_kbatch_k0_n0_n1_k1, + make_multi_index(kbatch_id, 0, in0, get_thread_local_1d_id() * NPerThread, 0)); + + static constexpr auto b_k0_n_k1_thread_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, Number{})); + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_block_desc_k0_m0_m1_k1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, I1, Number{}, K1), max_lds_align); + + // TODO: check alignment + // A matrix in LDS memory, for blockwise GEMM + constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + static_assert(a_block_desc_k0_m0_m1_k1.GetElementSpaceSize() == + a_k0_m_k1_block_desc.GetElementSpaceSize() && + "wrong!"); + + const auto blockwise_tsmm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v3{}; + + constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = + decltype(blockwise_tsmm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); + + constexpr auto c_thread_desc_m10_m11_n10_n11 = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + + // 
LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_block_desc_k0_m0_m1_k1.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + + auto b_thread_odd_buf = make_static_buffer( + b_k0_n_k1_thread_desc.GetElementSpaceSize()); + + auto b_thread_even_buf = make_static_buffer( + b_k0_n_k1_thread_desc.GetElementSpaceSize()); + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_thread_desc_m10_m11_n10_n11.GetElementSpaceSize()); + + // Initialize C + c_thread_buf.Clear(); + + constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0, 0); + constexpr auto b_thread_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0, 0); + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_block_desc_copy_kbatch_k0_m0_m1_k1.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_block_desc_copy_kbatch_k0_m0_m1_k1.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead(a_grid_desc_kbatch_k0_m0_m1_k1, + a_global_buf); // a_global_buf -> reg_tmp_buf + a_blockwise_copy.RunWrite(a_block_desc_copy_kbatch_k0_m0_m1_k1, + a_block_even_buf); // reg_tmp_buf->a_block_even_buf + + b_threadwise_copy.Run(b_grid_desc_kbatch_k0_n0_n1_k1, + b_global_buf, + b_thread_desc_copy_k0_n0_n1_k1, + make_tuple(I0, I0, I0, I0, I0), + b_thread_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + const auto K0 = a_grid_desc_kbatch_k0_m0_m1_k1.GetLength(I1); + + index_t k_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_kbatch_k0_m0_m1_k1, + a_block_slice_copy_step); + + b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc_kbatch_k0_n0_n1_k1, + b_thread_slice_copy_step); + 
+ // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead(a_grid_desc_kbatch_k0_m0_m1_k1, a_global_buf); + + b_threadwise_copy.Run(b_grid_desc_kbatch_k0_n0_n1_k1, + b_global_buf, + b_thread_desc_copy_k0_n0_n1_k1, + make_tuple(I0, I0, I0, I0, I0), + b_thread_odd_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on current data + blockwise_tsmm.Run(a_block_even_buf, b_thread_even_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_copy_kbatch_k0_m0_m1_k1, a_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_kbatch_k0_m0_m1_k1, + a_block_slice_copy_step); + + b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc_kbatch_k0_n0_n1_k1, + b_thread_slice_copy_step); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead(a_grid_desc_kbatch_k0_m0_m1_k1, a_global_buf); + + b_threadwise_copy.Run(b_grid_desc_kbatch_k0_n0_n1_k1, + b_global_buf, + b_thread_desc_copy_k0_n0_n1_k1, + make_tuple(I0, I0, I0, I0, I0), + b_thread_even_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on current data + blockwise_tsmm.Run(a_block_odd_buf, b_thread_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_copy_kbatch_k0_m0_m1_k1, a_block_even_buf); + + k_block_data_begin += 2 * K0PerBlock; + } while(k_block_data_begin < K0 - 2 * K0PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_kbatch_k0_m0_m1_k1, + a_block_slice_copy_step); + + b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc_kbatch_k0_n0_n1_k1, + b_thread_slice_copy_step); + + block_sync_lds(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead(a_grid_desc_kbatch_k0_m0_m1_k1, a_global_buf); + + b_threadwise_copy.Run(b_grid_desc_kbatch_k0_n0_n1_k1, + b_global_buf, + 
b_thread_desc_copy_k0_n0_n1_k1, + make_tuple(I0, I0, I0, I0, I0), + b_thread_odd_buf); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_tsmm.Run(a_block_even_buf, b_thread_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_block_desc_copy_kbatch_k0_m0_m1_k1, a_block_odd_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on last data + blockwise_tsmm.Run(a_block_odd_buf, b_thread_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_tsmm.Run(a_block_even_buf, b_thread_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto c_thread_desc_m0_m10_m11_n0_n10_n11 = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = + blockwise_tsmm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id()); + + ThreadwiseTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_thread_desc_m0_m10_m11_n0_n10_n11), + decltype(c_grid_desc_m0_m10_m11_n0_n10_n11), + ck::tensor_operation::element_wise::PassThrough, + Sequence<1, + c_m10_m11_n10_n11_thread_tensor_lengths[I0], + c_m10_m11_n10_n11_thread_tensor_lengths[I1], + 1, + c_m10_m11_n10_n11_thread_tensor_lengths[I2], + c_m10_m11_n10_n11_thread_tensor_lengths[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{c_grid_desc_m0_m10_m11_n0_n10_n11, + make_multi_index(im0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], + in0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I3]), + ck::tensor_operation::element_wise::PassThrough{}} + .Run(c_thread_desc_m0_m10_m11_n0_n10_n11, + make_tuple(I0, I0, I0, I0, I0, 
I0), + c_thread_buf, + c_grid_desc_m0_m10_m11_n0_n10_n11, + c_grid_buf); + } + } +}; +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/composable_kernel/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp b/composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp b/composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp b/composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp 
b/composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp diff --git a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp b/composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp diff --git a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp diff --git 
a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp rename to 
composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp similarity index 100% 
rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp b/composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp diff --git a/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/composable_kernel/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp 
b/composable_kernel/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/composable_kernel/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp similarity index 100% rename from include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp rename to composable_kernel/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/composable_kernel/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp similarity index 100% rename from include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp rename to composable_kernel/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp diff --git a/include/ck/utility/amd_address_space.hpp b/composable_kernel/include/ck/utility/amd_address_space.hpp similarity index 100% rename from include/ck/utility/amd_address_space.hpp rename to composable_kernel/include/ck/utility/amd_address_space.hpp diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/composable_kernel/include/ck/utility/amd_buffer_addressing.hpp similarity index 100% rename from include/ck/utility/amd_buffer_addressing.hpp rename to composable_kernel/include/ck/utility/amd_buffer_addressing.hpp diff --git a/include/ck/utility/amd_gemm_dpp.hpp b/composable_kernel/include/ck/utility/amd_gemm_dpp.hpp similarity index 100% rename from include/ck/utility/amd_gemm_dpp.hpp rename to composable_kernel/include/ck/utility/amd_gemm_dpp.hpp diff --git a/include/ck/utility/amd_inline_asm.hpp 
b/composable_kernel/include/ck/utility/amd_inline_asm.hpp similarity index 100% rename from include/ck/utility/amd_inline_asm.hpp rename to composable_kernel/include/ck/utility/amd_inline_asm.hpp diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/composable_kernel/include/ck/utility/amd_wave_read_first_lane.hpp similarity index 100% rename from include/ck/utility/amd_wave_read_first_lane.hpp rename to composable_kernel/include/ck/utility/amd_wave_read_first_lane.hpp diff --git a/include/ck/utility/amd_wmma.hpp b/composable_kernel/include/ck/utility/amd_wmma.hpp similarity index 100% rename from include/ck/utility/amd_wmma.hpp rename to composable_kernel/include/ck/utility/amd_wmma.hpp diff --git a/include/ck/utility/amd_xdlops.hpp b/composable_kernel/include/ck/utility/amd_xdlops.hpp similarity index 100% rename from include/ck/utility/amd_xdlops.hpp rename to composable_kernel/include/ck/utility/amd_xdlops.hpp diff --git a/include/ck/utility/array.hpp b/composable_kernel/include/ck/utility/array.hpp similarity index 100% rename from include/ck/utility/array.hpp rename to composable_kernel/include/ck/utility/array.hpp diff --git a/include/ck/utility/array_multi_index.hpp b/composable_kernel/include/ck/utility/array_multi_index.hpp similarity index 100% rename from include/ck/utility/array_multi_index.hpp rename to composable_kernel/include/ck/utility/array_multi_index.hpp diff --git a/include/ck/utility/c_style_pointer_cast.hpp b/composable_kernel/include/ck/utility/c_style_pointer_cast.hpp similarity index 100% rename from include/ck/utility/c_style_pointer_cast.hpp rename to composable_kernel/include/ck/utility/c_style_pointer_cast.hpp diff --git a/include/ck/utility/common_header.hpp b/composable_kernel/include/ck/utility/common_header.hpp similarity index 100% rename from include/ck/utility/common_header.hpp rename to composable_kernel/include/ck/utility/common_header.hpp diff --git a/include/ck/utility/container_element_picker.hpp 
b/composable_kernel/include/ck/utility/container_element_picker.hpp similarity index 100% rename from include/ck/utility/container_element_picker.hpp rename to composable_kernel/include/ck/utility/container_element_picker.hpp diff --git a/include/ck/utility/container_helper.hpp b/composable_kernel/include/ck/utility/container_helper.hpp similarity index 100% rename from include/ck/utility/container_helper.hpp rename to composable_kernel/include/ck/utility/container_helper.hpp diff --git a/include/ck/utility/data_type.hpp b/composable_kernel/include/ck/utility/data_type.hpp similarity index 100% rename from include/ck/utility/data_type.hpp rename to composable_kernel/include/ck/utility/data_type.hpp diff --git a/include/ck/utility/debug.hpp b/composable_kernel/include/ck/utility/debug.hpp similarity index 100% rename from include/ck/utility/debug.hpp rename to composable_kernel/include/ck/utility/debug.hpp diff --git a/include/ck/utility/dynamic_buffer.hpp b/composable_kernel/include/ck/utility/dynamic_buffer.hpp similarity index 100% rename from include/ck/utility/dynamic_buffer.hpp rename to composable_kernel/include/ck/utility/dynamic_buffer.hpp diff --git a/include/ck/utility/enable_if.hpp b/composable_kernel/include/ck/utility/enable_if.hpp similarity index 100% rename from include/ck/utility/enable_if.hpp rename to composable_kernel/include/ck/utility/enable_if.hpp diff --git a/include/ck/utility/f8_utils.hpp b/composable_kernel/include/ck/utility/f8_utils.hpp similarity index 100% rename from include/ck/utility/f8_utils.hpp rename to composable_kernel/include/ck/utility/f8_utils.hpp diff --git a/include/ck/utility/functional.hpp b/composable_kernel/include/ck/utility/functional.hpp similarity index 100% rename from include/ck/utility/functional.hpp rename to composable_kernel/include/ck/utility/functional.hpp diff --git a/include/ck/utility/functional2.hpp b/composable_kernel/include/ck/utility/functional2.hpp similarity index 100% rename from 
include/ck/utility/functional2.hpp rename to composable_kernel/include/ck/utility/functional2.hpp diff --git a/include/ck/utility/functional3.hpp b/composable_kernel/include/ck/utility/functional3.hpp similarity index 100% rename from include/ck/utility/functional3.hpp rename to composable_kernel/include/ck/utility/functional3.hpp diff --git a/include/ck/utility/functional4.hpp b/composable_kernel/include/ck/utility/functional4.hpp similarity index 100% rename from include/ck/utility/functional4.hpp rename to composable_kernel/include/ck/utility/functional4.hpp diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/composable_kernel/include/ck/utility/generic_memory_space_atomic.hpp similarity index 100% rename from include/ck/utility/generic_memory_space_atomic.hpp rename to composable_kernel/include/ck/utility/generic_memory_space_atomic.hpp diff --git a/include/ck/utility/get_id.hpp b/composable_kernel/include/ck/utility/get_id.hpp similarity index 100% rename from include/ck/utility/get_id.hpp rename to composable_kernel/include/ck/utility/get_id.hpp diff --git a/include/ck/utility/get_shift.hpp b/composable_kernel/include/ck/utility/get_shift.hpp similarity index 100% rename from include/ck/utility/get_shift.hpp rename to composable_kernel/include/ck/utility/get_shift.hpp diff --git a/include/ck/utility/ignore.hpp b/composable_kernel/include/ck/utility/ignore.hpp similarity index 100% rename from include/ck/utility/ignore.hpp rename to composable_kernel/include/ck/utility/ignore.hpp diff --git a/include/ck/utility/inner_product.hpp b/composable_kernel/include/ck/utility/inner_product.hpp similarity index 100% rename from include/ck/utility/inner_product.hpp rename to composable_kernel/include/ck/utility/inner_product.hpp diff --git a/include/ck/utility/inner_product_dpp8.hpp b/composable_kernel/include/ck/utility/inner_product_dpp8.hpp similarity index 100% rename from include/ck/utility/inner_product_dpp8.hpp rename to 
composable_kernel/include/ck/utility/inner_product_dpp8.hpp diff --git a/include/ck/utility/integral_constant.hpp b/composable_kernel/include/ck/utility/integral_constant.hpp similarity index 100% rename from include/ck/utility/integral_constant.hpp rename to composable_kernel/include/ck/utility/integral_constant.hpp diff --git a/include/ck/utility/is_detected.hpp b/composable_kernel/include/ck/utility/is_detected.hpp similarity index 100% rename from include/ck/utility/is_detected.hpp rename to composable_kernel/include/ck/utility/is_detected.hpp diff --git a/include/ck/utility/is_known_at_compile_time.hpp b/composable_kernel/include/ck/utility/is_known_at_compile_time.hpp similarity index 100% rename from include/ck/utility/is_known_at_compile_time.hpp rename to composable_kernel/include/ck/utility/is_known_at_compile_time.hpp diff --git a/include/ck/utility/loop_scheduler.hpp b/composable_kernel/include/ck/utility/loop_scheduler.hpp similarity index 100% rename from include/ck/utility/loop_scheduler.hpp rename to composable_kernel/include/ck/utility/loop_scheduler.hpp diff --git a/include/ck/utility/magic_division.hpp b/composable_kernel/include/ck/utility/magic_division.hpp similarity index 100% rename from include/ck/utility/magic_division.hpp rename to composable_kernel/include/ck/utility/magic_division.hpp diff --git a/include/ck/utility/math.hpp b/composable_kernel/include/ck/utility/math.hpp similarity index 100% rename from include/ck/utility/math.hpp rename to composable_kernel/include/ck/utility/math.hpp diff --git a/include/ck/utility/math_v2.hpp b/composable_kernel/include/ck/utility/math_v2.hpp similarity index 100% rename from include/ck/utility/math_v2.hpp rename to composable_kernel/include/ck/utility/math_v2.hpp diff --git a/include/ck/utility/multi_index.hpp b/composable_kernel/include/ck/utility/multi_index.hpp similarity index 100% rename from include/ck/utility/multi_index.hpp rename to composable_kernel/include/ck/utility/multi_index.hpp 
diff --git a/include/ck/utility/number.hpp b/composable_kernel/include/ck/utility/number.hpp similarity index 100% rename from include/ck/utility/number.hpp rename to composable_kernel/include/ck/utility/number.hpp diff --git a/include/ck/utility/random_gen.hpp b/composable_kernel/include/ck/utility/random_gen.hpp similarity index 100% rename from include/ck/utility/random_gen.hpp rename to composable_kernel/include/ck/utility/random_gen.hpp diff --git a/include/ck/utility/reduction_common.hpp b/composable_kernel/include/ck/utility/reduction_common.hpp similarity index 100% rename from include/ck/utility/reduction_common.hpp rename to composable_kernel/include/ck/utility/reduction_common.hpp diff --git a/include/ck/utility/reduction_enums.hpp b/composable_kernel/include/ck/utility/reduction_enums.hpp similarity index 100% rename from include/ck/utility/reduction_enums.hpp rename to composable_kernel/include/ck/utility/reduction_enums.hpp diff --git a/include/ck/utility/reduction_functions_accumulate.hpp b/composable_kernel/include/ck/utility/reduction_functions_accumulate.hpp similarity index 100% rename from include/ck/utility/reduction_functions_accumulate.hpp rename to composable_kernel/include/ck/utility/reduction_functions_accumulate.hpp diff --git a/include/ck/utility/reduction_operator.hpp b/composable_kernel/include/ck/utility/reduction_operator.hpp similarity index 100% rename from include/ck/utility/reduction_operator.hpp rename to composable_kernel/include/ck/utility/reduction_operator.hpp diff --git a/include/ck/utility/sequence.hpp b/composable_kernel/include/ck/utility/sequence.hpp similarity index 100% rename from include/ck/utility/sequence.hpp rename to composable_kernel/include/ck/utility/sequence.hpp diff --git a/include/ck/utility/sequence_helper.hpp b/composable_kernel/include/ck/utility/sequence_helper.hpp similarity index 100% rename from include/ck/utility/sequence_helper.hpp rename to composable_kernel/include/ck/utility/sequence_helper.hpp 
diff --git a/include/ck/utility/span.hpp b/composable_kernel/include/ck/utility/span.hpp similarity index 100% rename from include/ck/utility/span.hpp rename to composable_kernel/include/ck/utility/span.hpp diff --git a/include/ck/utility/static_buffer.hpp b/composable_kernel/include/ck/utility/static_buffer.hpp similarity index 100% rename from include/ck/utility/static_buffer.hpp rename to composable_kernel/include/ck/utility/static_buffer.hpp diff --git a/include/ck/utility/statically_indexed_array.hpp b/composable_kernel/include/ck/utility/statically_indexed_array.hpp similarity index 100% rename from include/ck/utility/statically_indexed_array.hpp rename to composable_kernel/include/ck/utility/statically_indexed_array.hpp diff --git a/include/ck/utility/statically_indexed_array_multi_index.hpp b/composable_kernel/include/ck/utility/statically_indexed_array_multi_index.hpp similarity index 100% rename from include/ck/utility/statically_indexed_array_multi_index.hpp rename to composable_kernel/include/ck/utility/statically_indexed_array_multi_index.hpp diff --git a/include/ck/utility/synchronization.hpp b/composable_kernel/include/ck/utility/synchronization.hpp similarity index 100% rename from include/ck/utility/synchronization.hpp rename to composable_kernel/include/ck/utility/synchronization.hpp diff --git a/include/ck/utility/thread_group.hpp b/composable_kernel/include/ck/utility/thread_group.hpp similarity index 100% rename from include/ck/utility/thread_group.hpp rename to composable_kernel/include/ck/utility/thread_group.hpp diff --git a/include/ck/utility/transpose_vectors.hpp b/composable_kernel/include/ck/utility/transpose_vectors.hpp similarity index 100% rename from include/ck/utility/transpose_vectors.hpp rename to composable_kernel/include/ck/utility/transpose_vectors.hpp diff --git a/include/ck/utility/tuple.hpp b/composable_kernel/include/ck/utility/tuple.hpp similarity index 100% rename from include/ck/utility/tuple.hpp rename to 
composable_kernel/include/ck/utility/tuple.hpp diff --git a/include/ck/utility/tuple_helper.hpp b/composable_kernel/include/ck/utility/tuple_helper.hpp similarity index 100% rename from include/ck/utility/tuple_helper.hpp rename to composable_kernel/include/ck/utility/tuple_helper.hpp diff --git a/include/ck/utility/type.hpp b/composable_kernel/include/ck/utility/type.hpp similarity index 100% rename from include/ck/utility/type.hpp rename to composable_kernel/include/ck/utility/type.hpp diff --git a/include/ck/utility/type_convert.hpp b/composable_kernel/include/ck/utility/type_convert.hpp similarity index 100% rename from include/ck/utility/type_convert.hpp rename to composable_kernel/include/ck/utility/type_convert.hpp diff --git a/include/ck/utility/workgroup_barrier.hpp b/composable_kernel/include/ck/utility/workgroup_barrier.hpp similarity index 100% rename from include/ck/utility/workgroup_barrier.hpp rename to composable_kernel/include/ck/utility/workgroup_barrier.hpp diff --git a/include/ck/utility/workgroup_synchronization.hpp b/composable_kernel/include/ck/utility/workgroup_synchronization.hpp similarity index 100% rename from include/ck/utility/workgroup_synchronization.hpp rename to composable_kernel/include/ck/utility/workgroup_synchronization.hpp diff --git a/include/ck/version.h.in b/composable_kernel/include/ck/version.h.in similarity index 100% rename from include/ck/version.h.in rename to composable_kernel/include/ck/version.h.in diff --git a/library/CMakeLists.txt b/composable_kernel/library/CMakeLists.txt similarity index 100% rename from library/CMakeLists.txt rename to composable_kernel/library/CMakeLists.txt diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp rename to 
composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp 
b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp rename to 
composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp diff --git 
a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp similarity index 100% rename from 
library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp diff --git 
a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp b/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp similarity index 100% rename from library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp rename to composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp rename to 
composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp 
b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp rename to 
composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp 
b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp 
b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp rename to 
composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp similarity index 100% rename from 
library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp similarity index 100% rename from library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp similarity index 90% rename from library/include/ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp rename to composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp index 683c3cc4b..6abb71d78 100755 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp +++ b/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp @@ -7,7 +7,7 @@ #include #include 
"ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemv.hpp" +#include "ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" @@ -19,12 +19,12 @@ namespace instance { void add_device_gemv_splitk_f16_f16_f16_mk_kn_mn_instances( std::vector>>& + DeviceTsmm>>& instances); void add_device_gemv_splitk_f16_f16_f16_mk_nk_mn_instances( std::vector>>& + DeviceTsmm>>& instances); template struct DeviceOperationInstanceFactory< - ck::tensor_operation::device::DeviceGemv> { - using DeviceOp = DeviceGemv +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceTsmm> +{ + using DeviceOp = DeviceTsmm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} 
// namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/utility/algorithm.hpp b/composable_kernel/library/include/ck/library/utility/algorithm.hpp similarity index 100% rename from library/include/ck/library/utility/algorithm.hpp rename to composable_kernel/library/include/ck/library/utility/algorithm.hpp diff --git a/library/include/ck/library/utility/check_err.hpp b/composable_kernel/library/include/ck/library/utility/check_err.hpp similarity index 100% rename from library/include/ck/library/utility/check_err.hpp rename to composable_kernel/library/include/ck/library/utility/check_err.hpp diff --git a/library/include/ck/library/utility/conv_common.hpp b/composable_kernel/library/include/ck/library/utility/conv_common.hpp similarity index 100% rename from library/include/ck/library/utility/conv_common.hpp rename to composable_kernel/library/include/ck/library/utility/conv_common.hpp diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/composable_kernel/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp similarity index 100% rename from library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp rename to composable_kernel/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp diff --git a/library/include/ck/library/utility/convolution_parameter.hpp b/composable_kernel/library/include/ck/library/utility/convolution_parameter.hpp similarity index 100% rename from library/include/ck/library/utility/convolution_parameter.hpp rename to composable_kernel/library/include/ck/library/utility/convolution_parameter.hpp diff --git a/library/include/ck/library/utility/device_memory.hpp b/composable_kernel/library/include/ck/library/utility/device_memory.hpp similarity index 100% rename from library/include/ck/library/utility/device_memory.hpp rename to composable_kernel/library/include/ck/library/utility/device_memory.hpp diff 
--git a/library/include/ck/library/utility/fill.hpp b/composable_kernel/library/include/ck/library/utility/fill.hpp similarity index 100% rename from library/include/ck/library/utility/fill.hpp rename to composable_kernel/library/include/ck/library/utility/fill.hpp diff --git a/library/include/ck/library/utility/host_common_util.hpp b/composable_kernel/library/include/ck/library/utility/host_common_util.hpp similarity index 100% rename from library/include/ck/library/utility/host_common_util.hpp rename to composable_kernel/library/include/ck/library/utility/host_common_util.hpp diff --git a/library/include/ck/library/utility/host_gemm.hpp b/composable_kernel/library/include/ck/library/utility/host_gemm.hpp similarity index 100% rename from library/include/ck/library/utility/host_gemm.hpp rename to composable_kernel/library/include/ck/library/utility/host_gemm.hpp diff --git a/library/include/ck/library/utility/host_tensor.hpp b/composable_kernel/library/include/ck/library/utility/host_tensor.hpp similarity index 100% rename from library/include/ck/library/utility/host_tensor.hpp rename to composable_kernel/library/include/ck/library/utility/host_tensor.hpp diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/composable_kernel/library/include/ck/library/utility/host_tensor_generator.hpp similarity index 100% rename from library/include/ck/library/utility/host_tensor_generator.hpp rename to composable_kernel/library/include/ck/library/utility/host_tensor_generator.hpp diff --git a/library/include/ck/library/utility/iterator.hpp b/composable_kernel/library/include/ck/library/utility/iterator.hpp similarity index 100% rename from library/include/ck/library/utility/iterator.hpp rename to composable_kernel/library/include/ck/library/utility/iterator.hpp diff --git a/library/include/ck/library/utility/literals.hpp b/composable_kernel/library/include/ck/library/utility/literals.hpp similarity index 100% rename from 
library/include/ck/library/utility/literals.hpp rename to composable_kernel/library/include/ck/library/utility/literals.hpp diff --git a/library/include/ck/library/utility/numeric.hpp b/composable_kernel/library/include/ck/library/utility/numeric.hpp similarity index 100% rename from library/include/ck/library/utility/numeric.hpp rename to composable_kernel/library/include/ck/library/utility/numeric.hpp diff --git a/library/include/ck/library/utility/ranges.hpp b/composable_kernel/library/include/ck/library/utility/ranges.hpp similarity index 100% rename from library/include/ck/library/utility/ranges.hpp rename to composable_kernel/library/include/ck/library/utility/ranges.hpp diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/avg_pool3d_bwd_ndhwc_instance_common.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/avg_pool3d_bwd_ndhwc_instance_common.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/avg_pool3d_bwd_ndhwc_instance_common.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/avg_pool3d_bwd_ndhwc_instance_common.hpp diff --git a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/device_avg_pool3d_bwd_ndhwc_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt diff --git 
a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt 
rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt 
b/composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt diff --git 
a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/common.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/common.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/common.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/common.hpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp similarity 
index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp rename 
to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_multiply/device_gemm_add_multiply_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_km_nk_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_comp_fp8_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_nk_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_fp8_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_kn_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_nk_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_fp8_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt new file mode 100755 index 000000000..c0667f70a --- /dev/null +++ b/composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt @@ -0,0 +1,17 @@ +list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list AND target EQUAL 0) + set(GEMV_SPLITK_INSTANCES) + + + if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + list(APPEND GEMV_SPLITK_INSTANCES device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp) + list(APPEND GEMV_SPLITK_INSTANCES device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp) + endif() + + + add_instance_library(device_gemv_splitk_instance ${GEMV_SPLITK_INSTANCES}) + set(target 1) + endif() +endforeach() \ No newline at end of file diff --git 
a/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 90% rename from library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp index b7919b0af..b480fab5c 100755 --- a/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -5,7 +5,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" @@ -35,158 +35,158 @@ using device_gemv_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, B, M1, B*N1, K0, K1, 1, N1, 1, S<1,1, 1, 1, K1>, S<1,K0, 1,M1, 1>,S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, 3, N1, S<0, 1, 2, 3, 4, 5>, 5, N1>; //N1=2 - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 1, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,1, 1, 1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - 
ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 1, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 1, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 2, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 2, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 2, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 
GemmMNPadding, 64, 1, 128, 3, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 3, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 3, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 4, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 4, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 4, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, 
S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 5, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 5, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 5, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 6, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 6, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, 
PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 6, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 7, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 7, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 7, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 8, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 8, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 
4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 8, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, //N1=4 - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 1, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 1, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 1, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 2, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, 
Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 2, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 2, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 3, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 3, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 3, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 4, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, 
S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 4, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 4, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 5, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 5, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 5, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < 
F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 6, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 6, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 6, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 7, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 7, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 7, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, 
S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 8, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 8, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 8, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, //N1=8 - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 1, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 1, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + 
ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 1, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 2, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 2, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 2, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 3, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 3, 4, 1, 8, 1, S<1,1, 1, 1, 
4>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 3, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 4, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 4, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 4, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 5, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - 
ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 5, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 5, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 6, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 6, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 6, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 
GemmMNPadding, 64, 1, 512, 7, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 7, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 7, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 8, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 8, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 8, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, 
S<0, 1, 2, 3, 4, 5>, 5, 8> // clang-format on >; void add_device_gemv_splitk_f16_f16_f16_mk_kn_mn_instances( std::vector>>& + DeviceTsmm>>& instances) { add_device_operation_instances(instances, device_gemv_splitk_f16_f16_f16_mk_kn_mn_instances{}); diff --git a/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 90% rename from library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp index e14d81af3..c15da3212 100755 --- a/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/composable_kernel/library/src/tensor_operation_instance/gpu/gemv_splitk/device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp @@ -5,7 +5,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" @@ -34,158 +34,158 @@ using device_gemv_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, B, M1, B*N1, K0, K1, 1, N1, 1, S<1,1, 1, 1, K1>, S<1,K0, 1,M1, 1>,S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, 4, K1, S<0, 1, 2, 3, 4, 5>, 5, N1>; //N1=2 - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, 
Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 1, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,1, 1, 1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 1, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 1, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 2, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 2, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 2, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, 
S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 3, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 3, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 3, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 4, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 4, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < 
F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 4, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 5, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 5, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 5, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 6, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 6, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, 
S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 6, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 7, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 7, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 7, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 8, 2, 1, 2, 1, S<1,1, 1, 1, 2>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + 
ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 8, 4, 1, 2, 1, S<1,1, 1, 1, 4>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 128, 8, 8, 1, 2, 1, S<1,1, 1, 1, 8>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, //N1=4 - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 1, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 1, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 1, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 2, 2, 1, 4, 1, S<1,1, 
1, 1, 2>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 2, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 2, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 3, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 3, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 3, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, - 
ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 4, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 4, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 4, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 5, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 5, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 
GemmMNPadding, 64, 1, 256, 5, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 6, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 6, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 6, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 7, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 7, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, 
S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 7, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 8, 2, 1, 4, 1, S<1,1, 1, 1, 2>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 8, 4, 1, 4, 1, S<1,1, 1, 1, 4>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 256, 8, 8, 1, 4, 1, S<1,1, 1, 1, 8>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, //N1=8 - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 1, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, 
PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 1, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 1, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,1, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 2, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 2, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 2, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,2, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 3, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 
2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 3, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 3, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,3, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 4, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 4, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 4, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,4, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, 
Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 5, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 5, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 5, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,5, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 6, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 6, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 6, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,6, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, 
S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 7, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 7, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 7, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,7, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 8, 2, 1, 8, 1, S<1,1, 1, 1, 2>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 8, 4, 1, 8, 1, S<1,1, 1, 1, 4>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, - ck::tensor_operation::device::deviceGemvDl + ck::tensor_operation::device::deviceTsmmDl < 
F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 1, 512, 8, 8, 1, 8, 1, S<1,1, 1, 1, 8>, S<1,8, 1,1, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8> // clang-format on >; void add_device_gemv_splitk_f16_f16_f16_mk_nk_mn_instances( std::vector>>& + DeviceTsmm>>& instances) { add_device_operation_instances(instances, device_gemv_splitk_f16_f16_f16_mk_nk_mn_instances{}); diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/xdl/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt 
similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp 
similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp similarity index 100% 
rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp similarity index 100% rename 
from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp similarity 
index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp 
rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_bias/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_bias/device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f32_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/device_max_pool_bwd_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/max_pool_bwd/max_pool_bwd_instance_common.hpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt 
b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp similarity index 
100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_int8_instance.hpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp similarity index 100% rename 
from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp b/composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/quantization/gemm/gemm_quantization_common.hpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp similarity index 
100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp 
similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp similarity index 100% rename from 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp 
b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp rename to 
composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp rename to composable_kernel/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp diff --git a/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/CMakeLists.txt b/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/CMakeLists.txt new file mode 100755 index 000000000..4cd245a48 --- /dev/null +++ b/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/CMakeLists.txt @@ -0,0 +1,18 @@ +list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list AND target EQUAL 0) + + set(TALL_AND_SKINNY_GEMM_SPLITK_INSTANCES) + + + if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + list(APPEND TALL_AND_SKINNY_GEMM_SPLITK_INSTANCES device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instance.cpp) + list(APPEND TALL_AND_SKINNY_GEMM_SPLITK_INSTANCES device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instance.cpp) + endif() + + + add_instance_library(device_tall_and_skinny_gemm_splitk_instance ${TALL_AND_SKINNY_GEMM_SPLITK_INSTANCES}) + set(target 1) + endif() + endforeach() \ No newline at end of file diff --git a/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instance.cpp new file mode 100755 index 000000000..3397d50b0 --- /dev/null +++ 
b/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< + // clang-format off +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer | ABlockTransfer| ABlockTransfer | BBlockTransfer| BThreadTransfer| BThreadTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess|SrcVectorTensorLengths| SrcVectorTensor|DstVectorTensorLengths| SrcAccess| SrcVectorDim| SrcScalarPerVector| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | 
KBatch_K0_M0_M1_K1| KBatch_K0_M0_M1_K1| ArrangeOrder| Order| KBatch_K0_M0_M1_K1 | ContiguousDimOrder| KBatch_K0_M0_M1_K1 | Order| | | Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + //< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmMNPadding, B, M1, B*N1, K0, K1, M1, N1, 1, S<1,1, 1, 1, K1>, S<1,K0, 1,M1, 1>,S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, 3, N1, S<0, 1, 2, 3, 4, 5>, 5, N1>; + //M1 is always tied to 16 + //N1=2 + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 1, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 1, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 1, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 2, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, 
PassThrough, GemmMNPadding, 64, 16, 128, 2, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 2, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 3, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 3, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 3, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 4, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 4, 4, 
16, 2, 1, S<1,1, 1, 1, 4>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 4, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 5, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2> + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 5, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 5, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 6, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 6, 4, 16, 2, 1, S<1,1, 1, 
1, 4>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 6, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + //ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 7, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + //ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 7, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + //ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 7, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 8, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 8, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,8, 
1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 8, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + //N1=4 + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 1, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 1, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 1, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 2, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 2, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, 
S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 2, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 3, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 3, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 3, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 4, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 4, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 
S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 4, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 5, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 5, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 5, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 6, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 6, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, 
S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 6, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 7, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 7, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 7, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 8, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 8, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 
3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 8, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // //N1=8 + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 1, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 1, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 1, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 2, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 2, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 
8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 2, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 3, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 3, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 3, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 4, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 4, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl 
+ < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 4, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8> + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 5, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 5, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 5, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 6, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 6, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, 
F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 6, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 7, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 7, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 7, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 8, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 8, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, 
Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 8, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 3, 8, S<0, 1, 2, 3, 4, 5>, 5, 8> + // clang-format on + >; +// Hands the caller's `instances` container and a value-initialized device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instances tuple to add_device_operation_instances; presumably that helper appends one instance per tuple entry (TODO confirm against its definition). +void add_device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100755 index 000000000..0b33a9747 --- /dev/null +++ b/composable_kernel/library/src/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk/device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_tall_and_skinny_gemm_splitk.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer | ABlockTransfer| ABlockTransfer | BBlockTransfer| BThreadTransfer| BThreadTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess|SrcVectorTensorLengths| SrcVectorTensor|DstVectorTensorLengths| SrcAccess| SrcVectorDim| SrcScalarPerVector| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | KBatch_K0_M0_M1_K1| KBatch_K0_M0_M1_K1| ArrangeOrder| Order| KBatch_K0_M0_M1_K1 | ContiguousDimOrder| KBatch_K0_M0_M1_K1 | Order| | | Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + //< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, 
AElementOp, BElementOp, CElementOp, GemmMNPadding, B, M1, B*N1, K0, K1, M1, N1, 1, S<1,1, 1, 1, K1>, S<1,K0, 1,M1, 1>,S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, S<1,1, 1, 1, K1>, S<0,1,2,3,4>, 4, K1, S<0, 1, 2, 3, 4, 5>, 5, N1>; + //M1 is always tied to 16 + //N1=2 + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 1, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 1, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 1, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 2, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 2, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, 
PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 2, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 3, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 3, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 3, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 4, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 4, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 
GemmMNPadding, 64, 16, 128, 4, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 5, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 5, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 5, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 6, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 6, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 
GemmMNPadding, 64, 16, 128, 6, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 7, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 7, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 7, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 8, 2, 16, 2, 1, S<1,1, 1, 1, 2>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 128, 8, 4, 16, 2, 1, S<1,1, 1, 1, 4>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 
GemmMNPadding, 64, 16, 128, 8, 8, 16, 2, 1, S<1,1, 1, 1, 8>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 2>, + // //N1=4 + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 1, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 1, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 1, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 2, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 2, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 2, 8, 16, 
4, 1, S<1,1, 1, 1, 8>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 3, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 3, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 3, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 4, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 4, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 4, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,4, 1,16, 1>, 
S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 5, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 5, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 5, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 6, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 6, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 6, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, 
S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 7, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 7, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 7, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 8, 2, 16, 4, 1, S<1,1, 1, 1, 2>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 8, 4, 16, 4, 1, S<1,1, 1, 1, 4>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 256, 8, 8, 16, 4, 1, S<1,1, 1, 1, 8>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, 
S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 4>, + // //N1=8 + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 1, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 1, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 1, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,1, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 2, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 2, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 2, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,2, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 
1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 3, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 3, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 3, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,3, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 4, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 4, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + ck::tensor_operation::device::deviceTsmmDl + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 4, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,4, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 
5>, 5, 8> + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 5, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 5, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 5, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,5, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 6, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 6, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 6, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,6, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + 
// ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 7, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 7, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 7, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,7, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 8, 2, 16, 8, 1, S<1,1, 1, 1, 2>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, S<1,1, 1, 1, 2>, S<0,1,2,3,4>, 4, 2, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 8, 4, 16, 8, 1, S<1,1, 1, 1, 4>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, S<1,1, 1, 1, 4>, S<0,1,2,3,4>, 4, 4, S<0, 1, 2, 3, 4, 5>, 5, 8>, + // ck::tensor_operation::device::deviceTsmmDl + // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 512, 8, 8, 16, 8, 1, S<1,1, 1, 1, 8>, S<1,8, 1,16, 1>, S<0,1,2,3,4>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, S<1,1, 1, 1, 8>, S<0,1,2,3,4>, 4, 8, S<0, 1, 2, 3, 4, 5>, 5, 8> + // 
clang-format on + >; + +void add_device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_tall_and_skinny_gemm_splitk_f16_f16_f16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/utility/CMakeLists.txt b/composable_kernel/library/src/utility/CMakeLists.txt similarity index 100% rename from library/src/utility/CMakeLists.txt rename to composable_kernel/library/src/utility/CMakeLists.txt diff --git a/library/src/utility/convolution_parameter.cpp b/composable_kernel/library/src/utility/convolution_parameter.cpp similarity index 100% rename from library/src/utility/convolution_parameter.cpp rename to composable_kernel/library/src/utility/convolution_parameter.cpp diff --git a/library/src/utility/device_memory.cpp b/composable_kernel/library/src/utility/device_memory.cpp similarity index 100% rename from library/src/utility/device_memory.cpp rename to composable_kernel/library/src/utility/device_memory.cpp diff --git a/library/src/utility/host_tensor.cpp b/composable_kernel/library/src/utility/host_tensor.cpp similarity index 100% rename from library/src/utility/host_tensor.cpp rename to composable_kernel/library/src/utility/host_tensor.cpp diff --git a/profiler/CMakeLists.txt b/composable_kernel/profiler/CMakeLists.txt similarity index 100% rename from profiler/CMakeLists.txt rename to composable_kernel/profiler/CMakeLists.txt diff --git a/profiler/README.md b/composable_kernel/profiler/README.md similarity index 100% rename from profiler/README.md rename to composable_kernel/profiler/README.md diff --git a/profiler/include/profiler/data_type_enum.hpp b/composable_kernel/profiler/include/profiler/data_type_enum.hpp similarity index 100% rename from profiler/include/profiler/data_type_enum.hpp rename to composable_kernel/profiler/include/profiler/data_type_enum.hpp diff 
--git a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp b/composable_kernel/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp diff --git a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batched_gemm_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batched_gemm_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batched_gemm_impl.hpp diff --git a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp similarity index 100% rename from 
profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batchnorm_backward_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batchnorm_backward_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batchnorm_backward_impl.hpp diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batchnorm_forward_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batchnorm_forward_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batchnorm_forward_impl.hpp diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/composable_kernel/profiler/include/profiler/profile_batchnorm_infer_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_batchnorm_infer_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_batchnorm_infer_impl.hpp diff --git 
a/profiler/include/profiler/profile_contraction_impl.hpp b/composable_kernel/profiler/include/profiler/profile_contraction_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_contraction_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_contraction_impl.hpp diff --git a/profiler/include/profiler/profile_contraction_utils.hpp b/composable_kernel/profiler/include/profiler/profile_contraction_utils.hpp similarity index 100% rename from profiler/include/profiler/profile_contraction_utils.hpp rename to composable_kernel/profiler/include/profiler/profile_contraction_utils.hpp diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/composable_kernel/profiler/include/profiler/profile_conv_bwd_data_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_conv_bwd_data_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_conv_bwd_data_impl.hpp diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp b/composable_kernel/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp b/composable_kernel/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/composable_kernel/profiler/include/profiler/profile_conv_fwd_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_conv_fwd_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_conv_fwd_impl.hpp diff --git 
a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp b/composable_kernel/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/composable_kernel/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_elementwise_layernorm_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_add_multiply_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp similarity index 100% rename from 
profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_bilinear_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_bilinear_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_bilinear_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_fastgelu_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_multiply_add_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_reduce_impl.hpp similarity index 100% rename from 
profiler/include/profiler/profile_gemm_reduce_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_reduce_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_splitk_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_splitk_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_splitk_impl.hpp diff --git a/profiler/include/profiler/profile_gemm_streamk_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemm_streamk_impl.hpp similarity index 100% rename from profiler/include/profiler/profile_gemm_streamk_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemm_streamk_impl.hpp diff --git a/profiler/include/profiler/profile_gemv_splitk_impl.hpp b/composable_kernel/profiler/include/profiler/profile_gemv_splitk_impl.hpp similarity index 98% rename from profiler/include/profiler/profile_gemv_splitk_impl.hpp rename to composable_kernel/profiler/include/profiler/profile_gemv_splitk_impl.hpp index cec561f8c..cd5b8ff9b 100644 --- a/profiler/include/profiler/profile_gemv_splitk_impl.hpp +++ b/composable_kernel/profiler/include/profiler/profile_gemv_splitk_impl.hpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemv.hpp" +#include "ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/gpu/gemv_splitk.hpp" @@ -95,7 +95,7 @@ bool profile_gemv_splitk_impl(int do_verification, a_device_buf.ToDevice(a_m_k.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data()); - using DeviceOp = ck::tensor_operation::device::DeviceGemv +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include 
"ck/tensor_operation/gpu/device/device_tall_and_skinny_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/tall_and_skinny_gemm_splitk.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_tall_and_skinny_gemm_splitk_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int KBatch) +{ + bool pass = true; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-1, 2}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-1, 2}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using 
BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + + using DeviceOp = ck::tensor_operation::device::DeviceTsmm; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // Run reference GEMM + if(do_verification) + { + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + } + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + float best_kbatch = 0; + + // profile device GEMM instances + for(auto& op_ptr : op_ptrs) + { + std::vector kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 36, 40, 60, + 64, 72, 80, 88, 96, 128, 144, 160, 176, 192, 256}; + + if(KBatch > 0) + { + kbatch_list = {KBatch}; + } + + for(std::size_t i = 0; i < kbatch_list.size(); i++) + { + auto kbatch_curr = kbatch_list[i]; + + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + 
StrideC, + a_element_op, + b_element_op, + c_element_op, + kbatch_curr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType( + std::cout << "c_host : ", c_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + } + } + + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch " + << kbatch_curr << std::endl; + +#if defined CK_ENABLE_FP8 + // set softer tolerances for fp8 + if constexpr(is_same_v || is_same_v || + is_same_v) + { + std::string msg = "Error: Incorrect results!"; + double rtol = 1e-1; + double atol = 1e-1; + pass = pass & ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, msg, rtol, atol); + } + else + { +#endif + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); +#if defined CK_ENABLE_FP8 + } +#endif + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + 
best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + best_kbatch = kbatch_curr; + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } + } + } + + if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f32"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = bf16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = int8"; + } + + if constexpr(is_same::value) + { + std::cout << " ALayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " ALayout = ColumnMajor"; + } + + if constexpr(is_same::value) + { + std::cout << " BLayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " BLayout = ColumnMajor"; + } + + std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA + << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch + << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec + << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/composable_kernel/profiler/src/CMakeLists.txt similarity index 100% rename from profiler/src/CMakeLists.txt rename to composable_kernel/profiler/src/CMakeLists.txt diff --git a/profiler/src/profile_avg_pool3d_bwd.cpp b/composable_kernel/profiler/src/profile_avg_pool3d_bwd.cpp similarity index 100% rename from profiler/src/profile_avg_pool3d_bwd.cpp rename to composable_kernel/profiler/src/profile_avg_pool3d_bwd.cpp diff --git a/profiler/src/profile_batched_gemm.cpp b/composable_kernel/profiler/src/profile_batched_gemm.cpp similarity index 100% rename from profiler/src/profile_batched_gemm.cpp rename to composable_kernel/profiler/src/profile_batched_gemm.cpp diff --git 
a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp b/composable_kernel/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp similarity index 100% rename from profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp rename to composable_kernel/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp diff --git a/profiler/src/profile_batched_gemm_gemm.cpp b/composable_kernel/profiler/src/profile_batched_gemm_gemm.cpp similarity index 100% rename from profiler/src/profile_batched_gemm_gemm.cpp rename to composable_kernel/profiler/src/profile_batched_gemm_gemm.cpp diff --git a/profiler/src/profile_batched_gemm_multi_d.cpp b/composable_kernel/profiler/src/profile_batched_gemm_multi_d.cpp similarity index 100% rename from profiler/src/profile_batched_gemm_multi_d.cpp rename to composable_kernel/profiler/src/profile_batched_gemm_multi_d.cpp diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/composable_kernel/profiler/src/profile_batched_gemm_reduce.cpp similarity index 100% rename from profiler/src/profile_batched_gemm_reduce.cpp rename to composable_kernel/profiler/src/profile_batched_gemm_reduce.cpp diff --git a/profiler/src/profile_batchnorm_bwd.cpp b/composable_kernel/profiler/src/profile_batchnorm_bwd.cpp similarity index 100% rename from profiler/src/profile_batchnorm_bwd.cpp rename to composable_kernel/profiler/src/profile_batchnorm_bwd.cpp diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/composable_kernel/profiler/src/profile_batchnorm_fwd.cpp similarity index 100% rename from profiler/src/profile_batchnorm_fwd.cpp rename to composable_kernel/profiler/src/profile_batchnorm_fwd.cpp diff --git a/profiler/src/profile_batchnorm_infer.cpp b/composable_kernel/profiler/src/profile_batchnorm_infer.cpp similarity index 100% rename from profiler/src/profile_batchnorm_infer.cpp rename to composable_kernel/profiler/src/profile_batchnorm_infer.cpp diff --git a/profiler/src/profile_contraction_bilinear.cpp 
b/composable_kernel/profiler/src/profile_contraction_bilinear.cpp similarity index 100% rename from profiler/src/profile_contraction_bilinear.cpp rename to composable_kernel/profiler/src/profile_contraction_bilinear.cpp diff --git a/profiler/src/profile_contraction_scale.cpp b/composable_kernel/profiler/src/profile_contraction_scale.cpp similarity index 100% rename from profiler/src/profile_contraction_scale.cpp rename to composable_kernel/profiler/src/profile_contraction_scale.cpp diff --git a/profiler/src/profile_conv_bwd_data.cpp b/composable_kernel/profiler/src/profile_conv_bwd_data.cpp similarity index 100% rename from profiler/src/profile_conv_bwd_data.cpp rename to composable_kernel/profiler/src/profile_conv_bwd_data.cpp diff --git a/profiler/src/profile_conv_fwd.cpp b/composable_kernel/profiler/src/profile_conv_fwd.cpp similarity index 100% rename from profiler/src/profile_conv_fwd.cpp rename to composable_kernel/profiler/src/profile_conv_fwd.cpp diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/composable_kernel/profiler/src/profile_conv_fwd_bias_relu.cpp similarity index 100% rename from profiler/src/profile_conv_fwd_bias_relu.cpp rename to composable_kernel/profiler/src/profile_conv_fwd_bias_relu.cpp diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/composable_kernel/profiler/src/profile_conv_fwd_bias_relu_add.cpp similarity index 100% rename from profiler/src/profile_conv_fwd_bias_relu_add.cpp rename to composable_kernel/profiler/src/profile_conv_fwd_bias_relu_add.cpp diff --git a/profiler/src/profile_conv_tensor_rearrange.cpp b/composable_kernel/profiler/src/profile_conv_tensor_rearrange.cpp similarity index 100% rename from profiler/src/profile_conv_tensor_rearrange.cpp rename to composable_kernel/profiler/src/profile_conv_tensor_rearrange.cpp diff --git a/profiler/src/profile_gemm.cpp b/composable_kernel/profiler/src/profile_gemm.cpp similarity index 100% rename from profiler/src/profile_gemm.cpp rename to 
composable_kernel/profiler/src/profile_gemm.cpp diff --git a/profiler/src/profile_gemm_add_add_fastgelu.cpp b/composable_kernel/profiler/src/profile_gemm_add_add_fastgelu.cpp similarity index 100% rename from profiler/src/profile_gemm_add_add_fastgelu.cpp rename to composable_kernel/profiler/src/profile_gemm_add_add_fastgelu.cpp diff --git a/profiler/src/profile_gemm_add_fastgelu.cpp b/composable_kernel/profiler/src/profile_gemm_add_fastgelu.cpp similarity index 100% rename from profiler/src/profile_gemm_add_fastgelu.cpp rename to composable_kernel/profiler/src/profile_gemm_add_fastgelu.cpp diff --git a/profiler/src/profile_gemm_add_multiply.cpp b/composable_kernel/profiler/src/profile_gemm_add_multiply.cpp similarity index 100% rename from profiler/src/profile_gemm_add_multiply.cpp rename to composable_kernel/profiler/src/profile_gemm_add_multiply.cpp diff --git a/profiler/src/profile_gemm_add_relu_add_layernorm.cpp b/composable_kernel/profiler/src/profile_gemm_add_relu_add_layernorm.cpp similarity index 100% rename from profiler/src/profile_gemm_add_relu_add_layernorm.cpp rename to composable_kernel/profiler/src/profile_gemm_add_relu_add_layernorm.cpp diff --git a/profiler/src/profile_gemm_bias_add_reduce.cpp b/composable_kernel/profiler/src/profile_gemm_bias_add_reduce.cpp similarity index 100% rename from profiler/src/profile_gemm_bias_add_reduce.cpp rename to composable_kernel/profiler/src/profile_gemm_bias_add_reduce.cpp diff --git a/profiler/src/profile_gemm_bilinear.cpp b/composable_kernel/profiler/src/profile_gemm_bilinear.cpp similarity index 100% rename from profiler/src/profile_gemm_bilinear.cpp rename to composable_kernel/profiler/src/profile_gemm_bilinear.cpp diff --git a/profiler/src/profile_gemm_fastgelu.cpp b/composable_kernel/profiler/src/profile_gemm_fastgelu.cpp similarity index 100% rename from profiler/src/profile_gemm_fastgelu.cpp rename to composable_kernel/profiler/src/profile_gemm_fastgelu.cpp diff --git 
a/profiler/src/profile_gemm_multiply_add.cpp b/composable_kernel/profiler/src/profile_gemm_multiply_add.cpp similarity index 100% rename from profiler/src/profile_gemm_multiply_add.cpp rename to composable_kernel/profiler/src/profile_gemm_multiply_add.cpp diff --git a/profiler/src/profile_gemm_reduce.cpp b/composable_kernel/profiler/src/profile_gemm_reduce.cpp similarity index 100% rename from profiler/src/profile_gemm_reduce.cpp rename to composable_kernel/profiler/src/profile_gemm_reduce.cpp diff --git a/profiler/src/profile_gemm_splitk.cpp b/composable_kernel/profiler/src/profile_gemm_splitk.cpp similarity index 100% rename from profiler/src/profile_gemm_splitk.cpp rename to composable_kernel/profiler/src/profile_gemm_splitk.cpp diff --git a/profiler/src/profile_gemm_streamk.cpp b/composable_kernel/profiler/src/profile_gemm_streamk.cpp similarity index 100% rename from profiler/src/profile_gemm_streamk.cpp rename to composable_kernel/profiler/src/profile_gemm_streamk.cpp diff --git a/profiler/src/profile_gemv_splitk.cpp b/composable_kernel/profiler/src/profile_gemv_splitk.cpp similarity index 100% rename from profiler/src/profile_gemv_splitk.cpp rename to composable_kernel/profiler/src/profile_gemv_splitk.cpp diff --git a/profiler/src/profile_grouped_conv_bwd_data.cpp b/composable_kernel/profiler/src/profile_grouped_conv_bwd_data.cpp similarity index 100% rename from profiler/src/profile_grouped_conv_bwd_data.cpp rename to composable_kernel/profiler/src/profile_grouped_conv_bwd_data.cpp diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/composable_kernel/profiler/src/profile_grouped_conv_bwd_weight.cpp similarity index 100% rename from profiler/src/profile_grouped_conv_bwd_weight.cpp rename to composable_kernel/profiler/src/profile_grouped_conv_bwd_weight.cpp diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/composable_kernel/profiler/src/profile_grouped_conv_fwd.cpp similarity index 100% rename from profiler/src/profile_grouped_conv_fwd.cpp 
rename to composable_kernel/profiler/src/profile_grouped_conv_fwd.cpp diff --git a/profiler/src/profile_grouped_gemm.cpp b/composable_kernel/profiler/src/profile_grouped_gemm.cpp similarity index 100% rename from profiler/src/profile_grouped_gemm.cpp rename to composable_kernel/profiler/src/profile_grouped_gemm.cpp diff --git a/profiler/src/profile_grouped_gemm_fastgelu.cpp b/composable_kernel/profiler/src/profile_grouped_gemm_fastgelu.cpp similarity index 100% rename from profiler/src/profile_grouped_gemm_fastgelu.cpp rename to composable_kernel/profiler/src/profile_grouped_gemm_fastgelu.cpp diff --git a/profiler/src/profile_groupnorm.cpp b/composable_kernel/profiler/src/profile_groupnorm.cpp similarity index 100% rename from profiler/src/profile_groupnorm.cpp rename to composable_kernel/profiler/src/profile_groupnorm.cpp diff --git a/profiler/src/profile_layernorm.cpp b/composable_kernel/profiler/src/profile_layernorm.cpp similarity index 100% rename from profiler/src/profile_layernorm.cpp rename to composable_kernel/profiler/src/profile_layernorm.cpp diff --git a/profiler/src/profile_max_pool3d_bwd.cpp b/composable_kernel/profiler/src/profile_max_pool3d_bwd.cpp similarity index 100% rename from profiler/src/profile_max_pool3d_bwd.cpp rename to composable_kernel/profiler/src/profile_max_pool3d_bwd.cpp diff --git a/profiler/src/profile_max_pool3d_fwd.cpp b/composable_kernel/profiler/src/profile_max_pool3d_fwd.cpp similarity index 100% rename from profiler/src/profile_max_pool3d_fwd.cpp rename to composable_kernel/profiler/src/profile_max_pool3d_fwd.cpp diff --git a/profiler/src/profile_reduce.cpp b/composable_kernel/profiler/src/profile_reduce.cpp similarity index 100% rename from profiler/src/profile_reduce.cpp rename to composable_kernel/profiler/src/profile_reduce.cpp diff --git a/profiler/src/profile_softmax.cpp b/composable_kernel/profiler/src/profile_softmax.cpp similarity index 100% rename from profiler/src/profile_softmax.cpp rename to 
composable_kernel/profiler/src/profile_softmax.cpp
diff --git a/composable_kernel/profiler/src/profile_tall_and_skinny_gemm_splitk.cpp b/composable_kernel/profiler/src/profile_tall_and_skinny_gemm_splitk.cpp
new file mode 100644
index 000000000..5aed9d6d4
--- /dev/null
+++ b/composable_kernel/profiler/src/profile_tall_and_skinny_gemm_splitk.cpp
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdio>
+#include <cstdlib>
+#include <exception>
+#include <initializer_list>
+#include <iostream>
+#include <numeric>
+#include <string>
+
+#include "profiler/profile_tall_and_skinny_gemm_splitk_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+// Matrix layouts selectable through arg3. Each name lists the index order of A, B and
+// C in turn; the usage text printed below spells out the same four combinations.
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+
+// Data-type combinations selectable through arg2. Only F16_F16_F16 has a dispatch
+// branch in this file; every other value is reported as not implemented.
+enum struct GemmDataType
+{
+    F32_F32_F32, // 0
+    F16_F16_F16, // 1
+    BF16_BF16_BF16, // 2
+    INT8_INT8_INT8, // 3
+    F8_F16_F16, // 4
+    F16_F8_F16, // 5
+};
+
+#define OP_NAME "tall_and_skinny_gemm_splitk"
+#define OP_DESC "Tall and Skinny GEMM splitk"
+
+namespace {
+
+// Converts argv[index] to int. A value std::stoi rejects (not a number, or outside the
+// range of int) is reported on stderr and ends the process with status 1, like the
+// usage path below, instead of escaping as an uncaught exception.
+int parse_int_arg(char* argv[], int index)
+{
+    try
+    {
+        return std::stoi(argv[index]);
+    }
+    catch(const std::exception&)
+    {
+        std::cerr << "arg" << index << ": '" << argv[index] << "' is not a valid integer"
+                  << std::endl;
+        exit(1);
+    }
+}
+
+} // namespace
+
+// Command-line entry point of the OP_NAME profiler operation.
+//
+// Requires argc == 15 (argv[0] plus arg1..arg14 as listed in the usage text). The
+// arguments are parsed, the element types and layouts named by arg2/arg3 are selected,
+// and everything is forwarded to ck::profiler::profile_tall_and_skinny_gemm_splitk_impl.
+//
+// Returns 0 when the implementation reports success, 1 when it reports failure or when
+// the requested data type / layout pair has no dispatch branch. A wrong argument count
+// or a non-integer argument prints a message and ends the process with status 1.
+int profile_tall_and_skinny_gemm_splitk(int argc, char* argv[])
+{
+    if(argc != 15)
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
+        printf("arg14: split k into multiple batch\n");
+        printf("note: only data type 1 (fp16) with layout 0 or 1 is implemented\n");
+        exit(1);
+    }
+
+    const auto data_type       = static_cast<GemmDataType>(parse_int_arg(argv, 2));
+    const auto layout          = static_cast<GemmMatrixLayout>(parse_int_arg(argv, 3));
+    const bool do_verification = parse_int_arg(argv, 4) != 0;
+    const int init_method      = parse_int_arg(argv, 5);
+    const bool do_log          = parse_int_arg(argv, 6) != 0;
+    const bool time_kernel     = parse_int_arg(argv, 7) != 0;
+
+    const int M = parse_int_arg(argv, 8);
+    const int N = parse_int_arg(argv, 9);
+    const int K = parse_int_arg(argv, 10);
+
+    // A negative stride selects the layout-derived default computed inside `profile`.
+    const int StrideA = parse_int_arg(argv, 11);
+    const int StrideB = parse_int_arg(argv, 12);
+    const int StrideC = parse_int_arg(argv, 13);
+    const int KBatch  = parse_int_arg(argv, 14);
+
+    using F32 = float;
+    using F16 = ck::half_t;
+    // #if defined CK_ENABLE_FP8
+    // using F8 = ck::f8_t;
+    // #endif
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    // Runs one instantiation and maps its bool result to an exit code. The arguments
+    // are value tags: only their types are used.
+    auto profile = [&](auto a_type,
+                       auto b_type,
+                       auto acc_type,
+                       auto c_type,
+                       auto a_layout,
+                       auto b_layout,
+                       auto c_layout) {
+        using ADataType   = decltype(a_type);
+        using BDataType   = decltype(b_type);
+        using AccDataType = decltype(acc_type);
+        using CDataType   = decltype(c_type);
+
+        using ALayout = decltype(a_layout);
+        using BLayout = decltype(b_layout);
+        using CLayout = decltype(c_layout);
+
+        // Defaults used when the caller passed a negative stride.
+        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+        const bool pass = ck::profiler::profile_tall_and_skinny_gemm_splitk_impl<ADataType,
+                                                                                 BDataType,
+                                                                                 AccDataType,
+                                                                                 CDataType,
+                                                                                 ALayout,
+                                                                                 BLayout,
+                                                                                 CLayout>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? DefaultStrideA : StrideA,
+            (StrideB < 0) ? DefaultStrideB : StrideB,
+            (StrideC < 0) ? DefaultStrideC : StrideC,
+            KBatch);
+
+        return pass ? 0 : 1;
+    };
+
+    // Only fp16 with row-major A and C is dispatched; the other combinations are kept
+    // below as commented-out placeholders.
+    // if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
+    // {
+    // return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{});
+    // }
+    // else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
+    // {
+    // return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{});
+    // }
+    // else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
+    // {
+    // return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{});
+    // }
+    // else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
+    // {
+    // return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{});
+    // }
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    // else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    // {
+    // return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
+    // }
+    // else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    // {
+    // return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
+    // }
+
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_tall_and_skinny_gemm_splitk);
diff --git a/profiler/src/profiler.cpp b/composable_kernel/profiler/src/profiler.cpp
similarity index 100%
rename from profiler/src/profiler.cpp
rename to composable_kernel/profiler/src/profiler.cpp
diff --git a/profiler/src/profiler_operation_registry.hpp b/composable_kernel/profiler/src/profiler_operation_registry.hpp
similarity index 100%
rename from profiler/src/profiler_operation_registry.hpp
rename
to composable_kernel/profiler/src/profiler_operation_registry.hpp diff --git a/rbuild.ini b/composable_kernel/rbuild.ini similarity index 100% rename from rbuild.ini rename to composable_kernel/rbuild.ini diff --git a/requirements.txt b/composable_kernel/requirements.txt similarity index 100% rename from requirements.txt rename to composable_kernel/requirements.txt diff --git a/script/check_copyright_year.sh b/composable_kernel/script/check_copyright_year.sh similarity index 100% rename from script/check_copyright_year.sh rename to composable_kernel/script/check_copyright_year.sh diff --git a/script/clang-format-overwrite.sh b/composable_kernel/script/clang-format-overwrite.sh similarity index 100% rename from script/clang-format-overwrite.sh rename to composable_kernel/script/clang-format-overwrite.sh diff --git a/script/cmake-ck-dev.sh b/composable_kernel/script/cmake-ck-dev.sh similarity index 100% rename from script/cmake-ck-dev.sh rename to composable_kernel/script/cmake-ck-dev.sh diff --git a/script/cmake-ck-release.sh b/composable_kernel/script/cmake-ck-release.sh similarity index 100% rename from script/cmake-ck-release.sh rename to composable_kernel/script/cmake-ck-release.sh diff --git a/script/count_vgpr.sh b/composable_kernel/script/count_vgpr.sh similarity index 100% rename from script/count_vgpr.sh rename to composable_kernel/script/count_vgpr.sh diff --git a/script/hipclang_opt.sh b/composable_kernel/script/hipclang_opt.sh similarity index 100% rename from script/hipclang_opt.sh rename to composable_kernel/script/hipclang_opt.sh diff --git a/script/install_precommit.sh b/composable_kernel/script/install_precommit.sh similarity index 100% rename from script/install_precommit.sh rename to composable_kernel/script/install_precommit.sh diff --git a/script/parse_perf_data.py b/composable_kernel/script/parse_perf_data.py similarity index 100% rename from script/parse_perf_data.py rename to composable_kernel/script/parse_perf_data.py diff --git 
a/script/process_perf_data.py b/composable_kernel/script/process_perf_data.py similarity index 100% rename from script/process_perf_data.py rename to composable_kernel/script/process_perf_data.py diff --git a/script/process_perf_data.sh b/composable_kernel/script/process_perf_data.sh similarity index 100% rename from script/process_perf_data.sh rename to composable_kernel/script/process_perf_data.sh diff --git a/script/process_qa_data.sh b/composable_kernel/script/process_qa_data.sh similarity index 100% rename from script/process_qa_data.sh rename to composable_kernel/script/process_qa_data.sh diff --git a/script/profile_batched_gemm.sh b/composable_kernel/script/profile_batched_gemm.sh similarity index 100% rename from script/profile_batched_gemm.sh rename to composable_kernel/script/profile_batched_gemm.sh diff --git a/script/profile_conv_bwd_data.sh b/composable_kernel/script/profile_conv_bwd_data.sh similarity index 100% rename from script/profile_conv_bwd_data.sh rename to composable_kernel/script/profile_conv_bwd_data.sh diff --git a/script/profile_conv_fwd.sh b/composable_kernel/script/profile_conv_fwd.sh similarity index 100% rename from script/profile_conv_fwd.sh rename to composable_kernel/script/profile_conv_fwd.sh diff --git a/script/profile_gemm.sh b/composable_kernel/script/profile_gemm.sh similarity index 100% rename from script/profile_gemm.sh rename to composable_kernel/script/profile_gemm.sh diff --git a/script/profile_gemm_bilinear.sh b/composable_kernel/script/profile_gemm_bilinear.sh similarity index 100% rename from script/profile_gemm_bilinear.sh rename to composable_kernel/script/profile_gemm_bilinear.sh diff --git a/script/profile_grouped_gemm.sh b/composable_kernel/script/profile_grouped_gemm.sh similarity index 100% rename from script/profile_grouped_gemm.sh rename to composable_kernel/script/profile_grouped_gemm.sh diff --git a/script/profile_onnx_gemm.sh b/composable_kernel/script/profile_onnx_gemm.sh similarity index 100% rename from 
script/profile_onnx_gemm.sh rename to composable_kernel/script/profile_onnx_gemm.sh diff --git a/script/profile_reduce_no_index.sh b/composable_kernel/script/profile_reduce_no_index.sh similarity index 100% rename from script/profile_reduce_no_index.sh rename to composable_kernel/script/profile_reduce_no_index.sh diff --git a/script/profile_reduce_with_index.sh b/composable_kernel/script/profile_reduce_with_index.sh similarity index 100% rename from script/profile_reduce_with_index.sh rename to composable_kernel/script/profile_reduce_with_index.sh diff --git a/script/profile_resnet50.sh b/composable_kernel/script/profile_resnet50.sh similarity index 100% rename from script/profile_resnet50.sh rename to composable_kernel/script/profile_resnet50.sh diff --git a/script/profile_splitK_gemm.sh b/composable_kernel/script/profile_splitK_gemm.sh similarity index 100% rename from script/profile_splitK_gemm.sh rename to composable_kernel/script/profile_splitK_gemm.sh diff --git a/script/run_full_performance_tests.sh b/composable_kernel/script/run_full_performance_tests.sh similarity index 100% rename from script/run_full_performance_tests.sh rename to composable_kernel/script/run_full_performance_tests.sh diff --git a/script/run_performance_tests.sh b/composable_kernel/script/run_performance_tests.sh similarity index 100% rename from script/run_performance_tests.sh rename to composable_kernel/script/run_performance_tests.sh diff --git a/script/test_convnd_fwd.sh b/composable_kernel/script/test_convnd_fwd.sh similarity index 100% rename from script/test_convnd_fwd.sh rename to composable_kernel/script/test_convnd_fwd.sh diff --git a/script/test_reduce_no_index.sh b/composable_kernel/script/test_reduce_no_index.sh similarity index 100% rename from script/test_reduce_no_index.sh rename to composable_kernel/script/test_reduce_no_index.sh diff --git a/script/test_reduce_with_index.sh b/composable_kernel/script/test_reduce_with_index.sh similarity index 100% rename from 
script/test_reduce_with_index.sh rename to composable_kernel/script/test_reduce_with_index.sh diff --git a/script/uninstall_precommit.sh b/composable_kernel/script/uninstall_precommit.sh similarity index 100% rename from script/uninstall_precommit.sh rename to composable_kernel/script/uninstall_precommit.sh diff --git a/test/CMakeLists.txt b/composable_kernel/test/CMakeLists.txt similarity index 100% rename from test/CMakeLists.txt rename to composable_kernel/test/CMakeLists.txt diff --git a/test/batched_gemm/CMakeLists.txt b/composable_kernel/test/batched_gemm/CMakeLists.txt similarity index 100% rename from test/batched_gemm/CMakeLists.txt rename to composable_kernel/test/batched_gemm/CMakeLists.txt diff --git a/test/batched_gemm/batched_gemm_bf16.cpp b/composable_kernel/test/batched_gemm/batched_gemm_bf16.cpp similarity index 100% rename from test/batched_gemm/batched_gemm_bf16.cpp rename to composable_kernel/test/batched_gemm/batched_gemm_bf16.cpp diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/composable_kernel/test/batched_gemm/batched_gemm_fp16.cpp similarity index 100% rename from test/batched_gemm/batched_gemm_fp16.cpp rename to composable_kernel/test/batched_gemm/batched_gemm_fp16.cpp diff --git a/test/batched_gemm/batched_gemm_fp32.cpp b/composable_kernel/test/batched_gemm/batched_gemm_fp32.cpp similarity index 100% rename from test/batched_gemm/batched_gemm_fp32.cpp rename to composable_kernel/test/batched_gemm/batched_gemm_fp32.cpp diff --git a/test/batched_gemm/batched_gemm_int8.cpp b/composable_kernel/test/batched_gemm/batched_gemm_int8.cpp similarity index 100% rename from test/batched_gemm/batched_gemm_int8.cpp rename to composable_kernel/test/batched_gemm/batched_gemm_int8.cpp diff --git a/test/batched_gemm/test_batched_gemm.cpp b/composable_kernel/test/batched_gemm/test_batched_gemm.cpp similarity index 100% rename from test/batched_gemm/test_batched_gemm.cpp rename to composable_kernel/test/batched_gemm/test_batched_gemm.cpp diff --git 
a/test/batched_gemm_gemm/CMakeLists.txt b/composable_kernel/test/batched_gemm_gemm/CMakeLists.txt similarity index 100% rename from test/batched_gemm_gemm/CMakeLists.txt rename to composable_kernel/test/batched_gemm_gemm/CMakeLists.txt diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp b/composable_kernel/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp similarity index 100% rename from test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp rename to composable_kernel/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp b/composable_kernel/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp similarity index 100% rename from test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp rename to composable_kernel/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp diff --git a/test/batched_gemm_multi_d/CMakeLists.txt b/composable_kernel/test/batched_gemm_multi_d/CMakeLists.txt similarity index 100% rename from test/batched_gemm_multi_d/CMakeLists.txt rename to composable_kernel/test/batched_gemm_multi_d/CMakeLists.txt diff --git a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp b/composable_kernel/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp similarity index 100% rename from test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp rename to composable_kernel/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp diff --git a/test/batched_gemm_reduce/CMakeLists.txt b/composable_kernel/test/batched_gemm_reduce/CMakeLists.txt similarity index 100% rename from test/batched_gemm_reduce/CMakeLists.txt rename to composable_kernel/test/batched_gemm_reduce/CMakeLists.txt diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/composable_kernel/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp similarity index 100% rename from test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp rename to 
composable_kernel/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp diff --git a/test/batched_gemm_softmax_gemm/CMakeLists.txt b/composable_kernel/test/batched_gemm_softmax_gemm/CMakeLists.txt similarity index 100% rename from test/batched_gemm_softmax_gemm/CMakeLists.txt rename to composable_kernel/test/batched_gemm_softmax_gemm/CMakeLists.txt diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp b/composable_kernel/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp similarity index 100% rename from test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp rename to composable_kernel/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/composable_kernel/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp similarity index 100% rename from test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp rename to composable_kernel/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp diff --git a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/composable_kernel/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt similarity index 100% rename from test/batched_gemm_softmax_gemm_permute/CMakeLists.txt rename to composable_kernel/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp b/composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp similarity index 100% rename from test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp rename to composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp 
b/composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp similarity index 100% rename from test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp rename to composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp b/composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp similarity index 100% rename from test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp rename to composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp b/composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp similarity index 100% rename from test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp rename to composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp b/composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp similarity index 100% rename from test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp rename to composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp similarity index 100% rename from 
test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp rename to composable_kernel/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp diff --git a/test/batchnorm/CMakeLists.txt b/composable_kernel/test/batchnorm/CMakeLists.txt similarity index 100% rename from test/batchnorm/CMakeLists.txt rename to composable_kernel/test/batchnorm/CMakeLists.txt diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/composable_kernel/test/batchnorm/batchnorm_bwd_rank_4.cpp similarity index 100% rename from test/batchnorm/batchnorm_bwd_rank_4.cpp rename to composable_kernel/test/batchnorm/batchnorm_bwd_rank_4.cpp diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/composable_kernel/test/batchnorm/batchnorm_fwd_rank_4.cpp similarity index 100% rename from test/batchnorm/batchnorm_fwd_rank_4.cpp rename to composable_kernel/test/batchnorm/batchnorm_fwd_rank_4.cpp diff --git a/test/batchnorm/batchnorm_infer_rank_4.cpp b/composable_kernel/test/batchnorm/batchnorm_infer_rank_4.cpp similarity index 100% rename from test/batchnorm/batchnorm_infer_rank_4.cpp rename to composable_kernel/test/batchnorm/batchnorm_infer_rank_4.cpp diff --git a/test/block_swizzle_test/block_swizzle_test.cpp b/composable_kernel/test/block_swizzle_test/block_swizzle_test.cpp similarity index 100% rename from test/block_swizzle_test/block_swizzle_test.cpp rename to composable_kernel/test/block_swizzle_test/block_swizzle_test.cpp diff --git a/test/block_swizzle_test/rebuild.sh b/composable_kernel/test/block_swizzle_test/rebuild.sh similarity index 100% rename from test/block_swizzle_test/rebuild.sh rename to composable_kernel/test/block_swizzle_test/rebuild.sh diff --git a/test/block_swizzle_test/simple_args.h b/composable_kernel/test/block_swizzle_test/simple_args.h similarity index 100% rename from test/block_swizzle_test/simple_args.h rename to composable_kernel/test/block_swizzle_test/simple_args.h diff --git 
a/test/block_to_ctile_map/CMakeLists.txt b/composable_kernel/test/block_to_ctile_map/CMakeLists.txt similarity index 100% rename from test/block_to_ctile_map/CMakeLists.txt rename to composable_kernel/test/block_to_ctile_map/CMakeLists.txt diff --git a/test/block_to_ctile_map/test_block_to_ctile_map.cpp b/composable_kernel/test/block_to_ctile_map/test_block_to_ctile_map.cpp similarity index 100% rename from test/block_to_ctile_map/test_block_to_ctile_map.cpp rename to composable_kernel/test/block_to_ctile_map/test_block_to_ctile_map.cpp diff --git a/test/contraction/CMakeLists.txt b/composable_kernel/test/contraction/CMakeLists.txt similarity index 100% rename from test/contraction/CMakeLists.txt rename to composable_kernel/test/contraction/CMakeLists.txt diff --git a/test/contraction/test_contraction.cpp b/composable_kernel/test/contraction/test_contraction.cpp similarity index 100% rename from test/contraction/test_contraction.cpp rename to composable_kernel/test/contraction/test_contraction.cpp diff --git a/test/contraction/test_contraction_interface.cpp b/composable_kernel/test/contraction/test_contraction_interface.cpp similarity index 100% rename from test/contraction/test_contraction_interface.cpp rename to composable_kernel/test/contraction/test_contraction_interface.cpp diff --git a/test/conv_tensor_rearrange/CMakeLists.txt b/composable_kernel/test/conv_tensor_rearrange/CMakeLists.txt similarity index 100% rename from test/conv_tensor_rearrange/CMakeLists.txt rename to composable_kernel/test/conv_tensor_rearrange/CMakeLists.txt diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp b/composable_kernel/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp similarity index 100% rename from test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp rename to composable_kernel/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp 
b/composable_kernel/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp similarity index 100% rename from test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp rename to composable_kernel/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp diff --git a/test/conv_util/CMakeLists.txt b/composable_kernel/test/conv_util/CMakeLists.txt similarity index 100% rename from test/conv_util/CMakeLists.txt rename to composable_kernel/test/conv_util/CMakeLists.txt diff --git a/test/conv_util/conv_util.cpp b/composable_kernel/test/conv_util/conv_util.cpp similarity index 100% rename from test/conv_util/conv_util.cpp rename to composable_kernel/test/conv_util/conv_util.cpp diff --git a/test/convnd_bwd_data/CMakeLists.txt b/composable_kernel/test/convnd_bwd_data/CMakeLists.txt similarity index 100% rename from test/convnd_bwd_data/CMakeLists.txt rename to composable_kernel/test/convnd_bwd_data/CMakeLists.txt diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/composable_kernel/test/convnd_bwd_data/convnd_bwd_data.cpp similarity index 100% rename from test/convnd_bwd_data/convnd_bwd_data.cpp rename to composable_kernel/test/convnd_bwd_data/convnd_bwd_data.cpp diff --git a/test/convnd_fwd/CMakeLists.txt b/composable_kernel/test/convnd_fwd/CMakeLists.txt similarity index 100% rename from test/convnd_fwd/CMakeLists.txt rename to composable_kernel/test/convnd_fwd/CMakeLists.txt diff --git a/test/convnd_fwd/convnd_fwd.cpp b/composable_kernel/test/convnd_fwd/convnd_fwd.cpp similarity index 100% rename from test/convnd_fwd/convnd_fwd.cpp rename to composable_kernel/test/convnd_fwd/convnd_fwd.cpp diff --git a/test/data_type/CMakeLists.txt b/composable_kernel/test/data_type/CMakeLists.txt similarity index 100% rename from test/data_type/CMakeLists.txt rename to composable_kernel/test/data_type/CMakeLists.txt diff --git a/test/data_type/test_bf8.cpp b/composable_kernel/test/data_type/test_bf8.cpp similarity index 100% rename from 
test/data_type/test_bf8.cpp rename to composable_kernel/test/data_type/test_bf8.cpp diff --git a/test/data_type/test_fp8.cpp b/composable_kernel/test/data_type/test_fp8.cpp similarity index 100% rename from test/data_type/test_fp8.cpp rename to composable_kernel/test/data_type/test_fp8.cpp diff --git a/test/data_type/test_int4.cpp b/composable_kernel/test/data_type/test_int4.cpp similarity index 100% rename from test/data_type/test_int4.cpp rename to composable_kernel/test/data_type/test_int4.cpp diff --git a/test/data_type/type_convert_const.cpp b/composable_kernel/test/data_type/type_convert_const.cpp similarity index 100% rename from test/data_type/type_convert_const.cpp rename to composable_kernel/test/data_type/type_convert_const.cpp diff --git a/test/elementwise_normalization/CMakeLists.txt b/composable_kernel/test/elementwise_normalization/CMakeLists.txt similarity index 100% rename from test/elementwise_normalization/CMakeLists.txt rename to composable_kernel/test/elementwise_normalization/CMakeLists.txt diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/composable_kernel/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp similarity index 100% rename from test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp rename to composable_kernel/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp diff --git a/test/gemm/CMakeLists.txt b/composable_kernel/test/gemm/CMakeLists.txt similarity index 100% rename from test/gemm/CMakeLists.txt rename to composable_kernel/test/gemm/CMakeLists.txt diff --git a/test/gemm/gemm_bf16.cpp b/composable_kernel/test/gemm/gemm_bf16.cpp similarity index 100% rename from test/gemm/gemm_bf16.cpp rename to composable_kernel/test/gemm/gemm_bf16.cpp diff --git a/test/gemm/gemm_fp16.cpp b/composable_kernel/test/gemm/gemm_fp16.cpp similarity index 100% rename from test/gemm/gemm_fp16.cpp rename to composable_kernel/test/gemm/gemm_fp16.cpp diff --git 
a/test/gemm/gemm_fp32.cpp b/composable_kernel/test/gemm/gemm_fp32.cpp similarity index 100% rename from test/gemm/gemm_fp32.cpp rename to composable_kernel/test/gemm/gemm_fp32.cpp diff --git a/test/gemm/gemm_fp64.cpp b/composable_kernel/test/gemm/gemm_fp64.cpp similarity index 100% rename from test/gemm/gemm_fp64.cpp rename to composable_kernel/test/gemm/gemm_fp64.cpp diff --git a/test/gemm/gemm_int8.cpp b/composable_kernel/test/gemm/gemm_int8.cpp similarity index 100% rename from test/gemm/gemm_int8.cpp rename to composable_kernel/test/gemm/gemm_int8.cpp diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/composable_kernel/test/gemm/gemm_standalone_xdl_fp16.cpp similarity index 100% rename from test/gemm/gemm_standalone_xdl_fp16.cpp rename to composable_kernel/test/gemm/gemm_standalone_xdl_fp16.cpp diff --git a/test/gemm/gemm_util.hpp b/composable_kernel/test/gemm/gemm_util.hpp similarity index 100% rename from test/gemm/gemm_util.hpp rename to composable_kernel/test/gemm/gemm_util.hpp diff --git a/test/gemm/instance/gemm_f16_nn_instance.cpp b/composable_kernel/test/gemm/instance/gemm_f16_nn_instance.cpp similarity index 100% rename from test/gemm/instance/gemm_f16_nn_instance.cpp rename to composable_kernel/test/gemm/instance/gemm_f16_nn_instance.cpp diff --git a/test/gemm/instance/gemm_f16_nn_instance.hpp b/composable_kernel/test/gemm/instance/gemm_f16_nn_instance.hpp similarity index 100% rename from test/gemm/instance/gemm_f16_nn_instance.hpp rename to composable_kernel/test/gemm/instance/gemm_f16_nn_instance.hpp diff --git a/test/gemm/instance/gemm_f16_nt_instance.cpp b/composable_kernel/test/gemm/instance/gemm_f16_nt_instance.cpp similarity index 100% rename from test/gemm/instance/gemm_f16_nt_instance.cpp rename to composable_kernel/test/gemm/instance/gemm_f16_nt_instance.cpp diff --git a/test/gemm/instance/gemm_f16_nt_instance.hpp b/composable_kernel/test/gemm/instance/gemm_f16_nt_instance.hpp similarity index 100% rename from 
test/gemm/instance/gemm_f16_nt_instance.hpp rename to composable_kernel/test/gemm/instance/gemm_f16_nt_instance.hpp diff --git a/test/gemm/instance/gemm_f16_tn_instance.cpp b/composable_kernel/test/gemm/instance/gemm_f16_tn_instance.cpp similarity index 100% rename from test/gemm/instance/gemm_f16_tn_instance.cpp rename to composable_kernel/test/gemm/instance/gemm_f16_tn_instance.cpp diff --git a/test/gemm/instance/gemm_f16_tn_instance.hpp b/composable_kernel/test/gemm/instance/gemm_f16_tn_instance.hpp similarity index 100% rename from test/gemm/instance/gemm_f16_tn_instance.hpp rename to composable_kernel/test/gemm/instance/gemm_f16_tn_instance.hpp diff --git a/test/gemm/instance/gemm_f16_tt_instance.cpp b/composable_kernel/test/gemm/instance/gemm_f16_tt_instance.cpp similarity index 100% rename from test/gemm/instance/gemm_f16_tt_instance.cpp rename to composable_kernel/test/gemm/instance/gemm_f16_tt_instance.cpp diff --git a/test/gemm/instance/gemm_f16_tt_instance.hpp b/composable_kernel/test/gemm/instance/gemm_f16_tt_instance.hpp similarity index 100% rename from test/gemm/instance/gemm_f16_tt_instance.hpp rename to composable_kernel/test/gemm/instance/gemm_f16_tt_instance.hpp diff --git a/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp b/composable_kernel/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp similarity index 100% rename from test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp rename to composable_kernel/test/gemm/instance/gemm_wavelet_f16_tn_instance.cpp diff --git a/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp b/composable_kernel/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp similarity index 100% rename from test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp rename to composable_kernel/test/gemm/instance/gemm_wavelet_f16_tn_instance.hpp diff --git a/test/gemm/run_gemm_test.inc b/composable_kernel/test/gemm/run_gemm_test.inc similarity index 100% rename from test/gemm/run_gemm_test.inc rename to 
composable_kernel/test/gemm/run_gemm_test.inc diff --git a/test/gemm_layernorm/CMakeLists.txt b/composable_kernel/test/gemm_layernorm/CMakeLists.txt similarity index 100% rename from test/gemm_layernorm/CMakeLists.txt rename to composable_kernel/test/gemm_layernorm/CMakeLists.txt diff --git a/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp b/composable_kernel/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp similarity index 100% rename from test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp rename to composable_kernel/test/gemm_layernorm/test_gemm_add_relu_add_layernorm_fp16.cpp diff --git a/test/gemm_reduce/CMakeLists.txt b/composable_kernel/test/gemm_reduce/CMakeLists.txt similarity index 100% rename from test/gemm_reduce/CMakeLists.txt rename to composable_kernel/test/gemm_reduce/CMakeLists.txt diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/composable_kernel/test/gemm_reduce/gemm_reduce_fp16.cpp similarity index 100% rename from test/gemm_reduce/gemm_reduce_fp16.cpp rename to composable_kernel/test/gemm_reduce/gemm_reduce_fp16.cpp diff --git a/test/gemm_split_k/CMakeLists.txt b/composable_kernel/test/gemm_split_k/CMakeLists.txt similarity index 100% rename from test/gemm_split_k/CMakeLists.txt rename to composable_kernel/test/gemm_split_k/CMakeLists.txt diff --git a/test/gemm_split_k/test_gemm_splitk.cpp b/composable_kernel/test/gemm_split_k/test_gemm_splitk.cpp similarity index 100% rename from test/gemm_split_k/test_gemm_splitk.cpp rename to composable_kernel/test/gemm_split_k/test_gemm_splitk.cpp diff --git a/test/gemm_split_k/test_gemm_splitk_ut_cases.inc b/composable_kernel/test/gemm_split_k/test_gemm_splitk_ut_cases.inc similarity index 100% rename from test/gemm_split_k/test_gemm_splitk_ut_cases.inc rename to composable_kernel/test/gemm_split_k/test_gemm_splitk_ut_cases.inc diff --git a/test/gemm_split_k/test_gemm_splitk_util.hpp b/composable_kernel/test/gemm_split_k/test_gemm_splitk_util.hpp similarity index 
100% rename from test/gemm_split_k/test_gemm_splitk_util.hpp rename to composable_kernel/test/gemm_split_k/test_gemm_splitk_util.hpp diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/composable_kernel/test/grouped_convnd_bwd_data/CMakeLists.txt similarity index 100% rename from test/grouped_convnd_bwd_data/CMakeLists.txt rename to composable_kernel/test/grouped_convnd_bwd_data/CMakeLists.txt diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp b/composable_kernel/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp similarity index 100% rename from test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp rename to composable_kernel/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data.cpp diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp b/composable_kernel/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp similarity index 100% rename from test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp rename to composable_kernel/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_wmma.cpp diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp b/composable_kernel/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp similarity index 100% rename from test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp rename to composable_kernel/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp diff --git a/test/grouped_convnd_bwd_weight/CMakeLists.txt b/composable_kernel/test/grouped_convnd_bwd_weight/CMakeLists.txt similarity index 100% rename from test/grouped_convnd_bwd_weight/CMakeLists.txt rename to composable_kernel/test/grouped_convnd_bwd_weight/CMakeLists.txt diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/composable_kernel/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp 
similarity index 100% rename from test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp rename to composable_kernel/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp b/composable_kernel/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp similarity index 100% rename from test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp rename to composable_kernel/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_wmma.cpp diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp b/composable_kernel/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp similarity index 100% rename from test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp rename to composable_kernel/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp diff --git a/test/grouped_convnd_fwd/CMakeLists.txt b/composable_kernel/test/grouped_convnd_fwd/CMakeLists.txt similarity index 100% rename from test/grouped_convnd_fwd/CMakeLists.txt rename to composable_kernel/test/grouped_convnd_fwd/CMakeLists.txt diff --git a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp b/composable_kernel/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp similarity index 100% rename from test/grouped_convnd_fwd/grouped_convnd_fwd.cpp rename to composable_kernel/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/composable_kernel/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp similarity index 100% rename from test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp rename to composable_kernel/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp diff --git a/test/grouped_gemm/CMakeLists.txt b/composable_kernel/test/grouped_gemm/CMakeLists.txt similarity index 100% rename from 
test/grouped_gemm/CMakeLists.txt rename to composable_kernel/test/grouped_gemm/CMakeLists.txt diff --git a/test/grouped_gemm/test_grouped_gemm_interface.cpp b/composable_kernel/test/grouped_gemm/test_grouped_gemm_interface.cpp similarity index 100% rename from test/grouped_gemm/test_grouped_gemm_interface.cpp rename to composable_kernel/test/grouped_gemm/test_grouped_gemm_interface.cpp diff --git a/test/grouped_gemm/test_grouped_gemm_splitk.cpp b/composable_kernel/test/grouped_gemm/test_grouped_gemm_splitk.cpp similarity index 100% rename from test/grouped_gemm/test_grouped_gemm_splitk.cpp rename to composable_kernel/test/grouped_gemm/test_grouped_gemm_splitk.cpp diff --git a/test/grouped_gemm/test_grouped_gemm_ut_cases.inc b/composable_kernel/test/grouped_gemm/test_grouped_gemm_ut_cases.inc similarity index 100% rename from test/grouped_gemm/test_grouped_gemm_ut_cases.inc rename to composable_kernel/test/grouped_gemm/test_grouped_gemm_ut_cases.inc diff --git a/test/grouped_gemm/test_grouped_gemm_util.hpp b/composable_kernel/test/grouped_gemm/test_grouped_gemm_util.hpp similarity index 100% rename from test/grouped_gemm/test_grouped_gemm_util.hpp rename to composable_kernel/test/grouped_gemm/test_grouped_gemm_util.hpp diff --git a/test/image_to_column/CMakeLists.txt b/composable_kernel/test/image_to_column/CMakeLists.txt similarity index 100% rename from test/image_to_column/CMakeLists.txt rename to composable_kernel/test/image_to_column/CMakeLists.txt diff --git a/test/image_to_column/test_image_to_column.cpp b/composable_kernel/test/image_to_column/test_image_to_column.cpp similarity index 100% rename from test/image_to_column/test_image_to_column.cpp rename to composable_kernel/test/image_to_column/test_image_to_column.cpp diff --git a/test/image_to_column/test_image_to_column_interface.cpp b/composable_kernel/test/image_to_column/test_image_to_column_interface.cpp similarity index 100% rename from test/image_to_column/test_image_to_column_interface.cpp rename 
to composable_kernel/test/image_to_column/test_image_to_column_interface.cpp diff --git a/test/magic_number_division/CMakeLists.txt b/composable_kernel/test/magic_number_division/CMakeLists.txt similarity index 100% rename from test/magic_number_division/CMakeLists.txt rename to composable_kernel/test/magic_number_division/CMakeLists.txt diff --git a/test/magic_number_division/magic_number_division.cpp b/composable_kernel/test/magic_number_division/magic_number_division.cpp similarity index 100% rename from test/magic_number_division/magic_number_division.cpp rename to composable_kernel/test/magic_number_division/magic_number_division.cpp diff --git a/test/normalization/CMakeLists.txt b/composable_kernel/test/normalization/CMakeLists.txt similarity index 100% rename from test/normalization/CMakeLists.txt rename to composable_kernel/test/normalization/CMakeLists.txt diff --git a/test/normalization/test_groupnorm_fp16.cpp b/composable_kernel/test/normalization/test_groupnorm_fp16.cpp similarity index 100% rename from test/normalization/test_groupnorm_fp16.cpp rename to composable_kernel/test/normalization/test_groupnorm_fp16.cpp diff --git a/test/normalization/test_groupnorm_fp32.cpp b/composable_kernel/test/normalization/test_groupnorm_fp32.cpp similarity index 100% rename from test/normalization/test_groupnorm_fp32.cpp rename to composable_kernel/test/normalization/test_groupnorm_fp32.cpp diff --git a/test/normalization/test_layernorm2d_fp16.cpp b/composable_kernel/test/normalization/test_layernorm2d_fp16.cpp similarity index 100% rename from test/normalization/test_layernorm2d_fp16.cpp rename to composable_kernel/test/normalization/test_layernorm2d_fp16.cpp diff --git a/test/normalization/test_layernorm2d_fp32.cpp b/composable_kernel/test/normalization/test_layernorm2d_fp32.cpp similarity index 100% rename from test/normalization/test_layernorm2d_fp32.cpp rename to composable_kernel/test/normalization/test_layernorm2d_fp32.cpp diff --git a/test/pool/CMakeLists.txt 
b/composable_kernel/test/pool/CMakeLists.txt similarity index 100% rename from test/pool/CMakeLists.txt rename to composable_kernel/test/pool/CMakeLists.txt diff --git a/test/pool/test_avg_pool3d_bwd.cpp b/composable_kernel/test/pool/test_avg_pool3d_bwd.cpp similarity index 100% rename from test/pool/test_avg_pool3d_bwd.cpp rename to composable_kernel/test/pool/test_avg_pool3d_bwd.cpp diff --git a/test/pool/test_avg_pool3d_fwd.cpp b/composable_kernel/test/pool/test_avg_pool3d_fwd.cpp similarity index 100% rename from test/pool/test_avg_pool3d_fwd.cpp rename to composable_kernel/test/pool/test_avg_pool3d_fwd.cpp diff --git a/test/pool/test_max_pool3d_bwd.cpp b/composable_kernel/test/pool/test_max_pool3d_bwd.cpp similarity index 100% rename from test/pool/test_max_pool3d_bwd.cpp rename to composable_kernel/test/pool/test_max_pool3d_bwd.cpp diff --git a/test/pool/test_max_pool3d_fwd.cpp b/composable_kernel/test/pool/test_max_pool3d_fwd.cpp similarity index 100% rename from test/pool/test_max_pool3d_fwd.cpp rename to composable_kernel/test/pool/test_max_pool3d_fwd.cpp diff --git a/test/pool/test_pool_fwd_common.hpp b/composable_kernel/test/pool/test_pool_fwd_common.hpp similarity index 100% rename from test/pool/test_pool_fwd_common.hpp rename to composable_kernel/test/pool/test_pool_fwd_common.hpp diff --git a/test/reduce/CMakeLists.txt b/composable_kernel/test/reduce/CMakeLists.txt similarity index 100% rename from test/reduce/CMakeLists.txt rename to composable_kernel/test/reduce/CMakeLists.txt diff --git a/test/reduce/reduce_no_index.cpp b/composable_kernel/test/reduce/reduce_no_index.cpp similarity index 100% rename from test/reduce/reduce_no_index.cpp rename to composable_kernel/test/reduce/reduce_no_index.cpp diff --git a/test/reduce/reduce_with_index.cpp b/composable_kernel/test/reduce/reduce_with_index.cpp similarity index 100% rename from test/reduce/reduce_with_index.cpp rename to composable_kernel/test/reduce/reduce_with_index.cpp diff --git 
a/test/reference_conv_fwd/CMakeLists.txt b/composable_kernel/test/reference_conv_fwd/CMakeLists.txt similarity index 100% rename from test/reference_conv_fwd/CMakeLists.txt rename to composable_kernel/test/reference_conv_fwd/CMakeLists.txt diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/composable_kernel/test/reference_conv_fwd/reference_conv_fwd.cpp similarity index 100% rename from test/reference_conv_fwd/reference_conv_fwd.cpp rename to composable_kernel/test/reference_conv_fwd/reference_conv_fwd.cpp diff --git a/test/softmax/CMakeLists.txt b/composable_kernel/test/softmax/CMakeLists.txt similarity index 100% rename from test/softmax/CMakeLists.txt rename to composable_kernel/test/softmax/CMakeLists.txt diff --git a/test/softmax/test_softmax_interface.cpp b/composable_kernel/test/softmax/test_softmax_interface.cpp similarity index 100% rename from test/softmax/test_softmax_interface.cpp rename to composable_kernel/test/softmax/test_softmax_interface.cpp diff --git a/test/softmax/test_softmax_rank3.cpp b/composable_kernel/test/softmax/test_softmax_rank3.cpp similarity index 100% rename from test/softmax/test_softmax_rank3.cpp rename to composable_kernel/test/softmax/test_softmax_rank3.cpp diff --git a/test/softmax/test_softmax_rank4.cpp b/composable_kernel/test/softmax/test_softmax_rank4.cpp similarity index 100% rename from test/softmax/test_softmax_rank4.cpp rename to composable_kernel/test/softmax/test_softmax_rank4.cpp diff --git a/test/softmax/test_softmax_ut_cases.inc b/composable_kernel/test/softmax/test_softmax_ut_cases.inc similarity index 100% rename from test/softmax/test_softmax_ut_cases.inc rename to composable_kernel/test/softmax/test_softmax_ut_cases.inc diff --git a/test/softmax/test_softmax_util.hpp b/composable_kernel/test/softmax/test_softmax_util.hpp similarity index 100% rename from test/softmax/test_softmax_util.hpp rename to composable_kernel/test/softmax/test_softmax_util.hpp diff --git 
a/test/space_filling_curve/CMakeLists.txt b/composable_kernel/test/space_filling_curve/CMakeLists.txt similarity index 100% rename from test/space_filling_curve/CMakeLists.txt rename to composable_kernel/test/space_filling_curve/CMakeLists.txt diff --git a/test/space_filling_curve/space_filling_curve.cpp b/composable_kernel/test/space_filling_curve/space_filling_curve.cpp similarity index 100% rename from test/space_filling_curve/space_filling_curve.cpp rename to composable_kernel/test/space_filling_curve/space_filling_curve.cpp diff --git a/test/wmma_op/CMakeLists.txt b/composable_kernel/test/wmma_op/CMakeLists.txt similarity index 100% rename from test/wmma_op/CMakeLists.txt rename to composable_kernel/test/wmma_op/CMakeLists.txt diff --git a/test/wmma_op/wmma_op.cpp b/composable_kernel/test/wmma_op/wmma_op.cpp similarity index 100% rename from test/wmma_op/wmma_op.cpp rename to composable_kernel/test/wmma_op/wmma_op.cpp diff --git a/test/wmma_op/wmma_op_util.hpp b/composable_kernel/test/wmma_op/wmma_op_util.hpp similarity index 100% rename from test/wmma_op/wmma_op_util.hpp rename to composable_kernel/test/wmma_op/wmma_op_util.hpp diff --git a/library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt deleted file mode 100755 index 0590def76..000000000 --- a/library/src/tensor_operation_instance/gpu/gemv_splitk/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -set(GEMV_SPLITK_INSTANCES) - - -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND GEMV_SPLITK_INSTANCES device_gemv_splitk_f16_f16_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMV_SPLITK_INSTANCES device_gemv_splitk_f16_f16_f16_mk_nk_mn_instance.cpp) -endif() - - -add_instance_library(device_gemv_splitk_instance ${GEMV_SPLITK_INSTANCES}) -- GitLab