From 304802889728707c2a162322ce18686169e732ea Mon Sep 17 00:00:00 2001
From: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Date: Thu, 13 Oct 2022 16:05:08 +0200
Subject: [PATCH 01/95] Refactor device op implementations into `impl` subdirectory. (#420)

* Move kernel implementation files under impl directory.
* Update examples paths.
* Update device kernel impl include paths.
* Update tensor operation instances include paths.
* Update profiler and tests include paths.
* Clang-format
* Update include paths for batched gemm reduce
* Refactor UnitTest ConvNDBwdWeight.
* Refactor fwd and bwd data convND UT.
* Fix used test macro.
* Fix include path.
* Fix include paths.
* Fix include paths in profiler and tests.
* Fix include paths.

Co-authored-by: Adam Osewski
---
 .../gemm_add_add_layernorm.cpp | 2 +-
 example/01_gemm/gemm_dl_fp16.cpp | 2 +-
 example/01_gemm/gemm_dl_fp32.cpp | 2 +-
 example/01_gemm/gemm_dl_int4.cpp | 2 +-
 example/01_gemm/gemm_dl_int8.cpp | 2 +-
 example/01_gemm/gemm_xdl_bf16.cpp | 2 +-
 example/01_gemm/gemm_xdl_fp16.cpp | 4 +-
 example/01_gemm/gemm_xdl_fp64.cpp | 2 +-
 example/01_gemm/gemm_xdl_int4.cpp | 2 +-
 example/01_gemm/gemm_xdl_int8.cpp | 2 +-
 example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp | 4 +-
 .../gemm_bilinear_xdl_fp16.cpp | 2 +-
 .../gemm_bias_relu_xdl_fp16.cpp | 2 +-
 example/04_gemm_add_add_fastgelu/common.hpp | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 2 +-
 example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp | 2 +-
 .../common.hpp | 2 +-
 example/12_reduce/reduce_blockwise_impl.hpp | 2 +-
 .../12_reduce/reduce_blockwise_two_call.cpp | 2 +-
 .../reduce_multiblock_atomic_add_impl.hpp | 2 +-
 example/13_pool2d_fwd/pool2d_fwd_common.hpp | 2 +-
 .../gemm_xdl_requant_relu_requant_int8.cpp | 2 +-
 .../grouped_gemm_xdl_bfp16.cpp | 2 +-
 .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 2 +-
 .../15_grouped_gemm/grouped_gemm_xdl_fp32.cpp | 2 +-
 .../15_grouped_gemm/grouped_gemm_xdl_int4.cpp | 2 +-
 .../15_grouped_gemm/grouped_gemm_xdl_int8.cpp | 2 +-
 .../gemm_add_add_mean_meansquare_xdl_fp16.cpp | 2 +-
 .../gemm_add_addsquare_xdl_int8.cpp | 2 +-
 .../gemm_max_xdl_bf16.cpp | 2 +-
 .../gemm_max_xdl_fp16.cpp | 2 +-
 .../gemm_max_xdl_fp32.cpp | 2 +-
 .../gemm_max_xdl_int4.cpp | 2 +-
 .../gemm_max_xdl_int8.cpp | 2 +-
 .../gemm_mean_meansquare_xdl_bf16.cpp | 2 +-
 .../gemm_mean_meansquare_xdl_fp16.cpp | 2 +-
 .../gemm_mean_meansquare_xdl_fp32.cpp | 2 +-
 .../convnd_bwd_data_xdl_fp16.cpp | 2 +-
 .../batched_gemm_reduce_xdl_fp16.cpp | 2 +-
 .../broadcast_add_2d_amn_bn.cpp | 2 +-
 .../broadcast_add_3d_am_bmnk.cpp | 2 +-
 .../elementwise_add_1d.cpp | 2 +-
 .../elementwise_add_4d.cpp | 2 +-
 .../convnd_bwd_weight_xdl_bf16.cpp | 2 +-
 .../convnd_bwd_weight_xdl_fp16.cpp | 2 +-
 .../gemm_bias_relu_add_layernorm_xdl_fp16.cpp | 4 +-
 .../gemm_layernorm_xdl_fp16.cpp | 4 +-
 .../gemm_xdl_layernorm_single_kernel_fp16.cpp | 2 +-
 example/22_cgemm/cgemm_xdl_bf16.cpp | 2 +-
 example/22_cgemm/cgemm_xdl_fp16.cpp | 2 +-
 example/22_cgemm/cgemm_xdl_fp32.cpp | 2 +-
 example/22_cgemm/cgemm_xdl_int4.cpp | 2 +-
 example/22_cgemm/cgemm_xdl_int8.cpp | 2 +-
 .../batched_gemm_xdl_bfp16.cpp | 2 +-
 .../24_batched_gemm/batched_gemm_xdl_fp16.cpp | 2 +-
 .../24_batched_gemm/batched_gemm_xdl_fp32.cpp | 2 +-
 .../24_batched_gemm/batched_gemm_xdl_int4.cpp | 2 +-
 .../24_batched_gemm/batched_gemm_xdl_int8.cpp | 2 +-
 .../gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp | 2 +-
.../gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp | 2 +- .../contraction_bilinear_xdl_fp32.cpp | 2 +- .../contraction_scale_xdl_fp32.cpp | 2 +- example/27_layernorm/layernorm_blockwise.cpp | 2 +- .../grouped_gemm_bias_e_permute_xdl_fp16.cpp | 2 +- .../batched_gemm_bias_e_permute_xdl_fp16.cpp | 2 +- ...uped_convnd_fwd_bias_relu_add_xdl_bf16.cpp | 2 +- ...uped_convnd_fwd_bias_relu_add_xdl_fp16.cpp | 2 +- ...uped_convnd_fwd_bias_relu_add_xdl_fp32.cpp | 2 +- ...uped_convnd_fwd_bias_relu_add_xdl_int4.cpp | 2 +- ...uped_convnd_fwd_bias_relu_add_xdl_int8.cpp | 2 +- .../batched_gemm_gemm_xdl_bf16.cpp | 2 +- .../batched_gemm_gemm_xdl_fp16.cpp | 2 +- .../batched_gemm_gemm_xdl_fp32.cpp | 2 +- .../batched_gemm_gemm_xdl_int4.cpp | 2 +- .../batched_gemm_gemm_xdl_int8.cpp | 2 +- ...le_scale_softmax_gemm_permute_xdl_fp16.cpp | 2 +- ...mm_scale_softmax_gemm_permute_xdl_fp16.cpp | 2 +- ...tched_gemm_scale_softmax_gemm_xdl_fp16.cpp | 2 +- .../dual_reduce_multiblock.cpp | 2 +- .../dual_reduce_threadwise.cpp | 2 +- .../34_batchnorm/batchnorm_forward_impl.hpp | 4 +- example/34_batchnorm/batchnorm_infer_impl.hpp | 2 +- .../35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp | 2 +- .../35_splitK_gemm/splitK_gemm_xdl_fp16.cpp | 2 +- .../35_splitK_gemm/splitK_gemm_xdl_fp32.cpp | 2 +- .../35_splitK_gemm/splitK_gemm_xdl_int4.cpp | 2 +- .../35_splitK_gemm/splitK_gemm_xdl_int8.cpp | 2 +- .../sparse_embedding3_forward_layernorm.cpp | 2 +- ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 2 +- .../grouped_conv_conv_fwd_xdl_bf16.cpp | 2 +- .../grouped_conv_conv_fwd_xdl_fp16.cpp | 2 +- .../grouped_conv_conv_fwd_xdl_fp32.cpp | 2 +- .../grouped_conv_conv_fwd_xdl_int4.cpp | 2 +- .../grouped_conv_conv_fwd_xdl_int8.cpp | 2 +- .../42_groupnorm/groupnorm_sigmoid_fp16.cpp | 2 +- ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 0 .../device_batched_gemm_e_permute_xdl.hpp | 683 ++++++++++++++++++ .../device_batched_gemm_gemm_xdl_cshuffle.hpp | 0 .../device_batched_gemm_multi_d_xdl.hpp | 6 +- ...ultiple_d_gemm_multiple_d_xdl_cshuffle.hpp | 0 ...evice_batched_gemm_reduce_xdl_cshuffle.hpp | 0 ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 2 +- ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp | 0 .../{ => impl}/device_batched_gemm_xdl.hpp | 0 .../device_cgemm_4gemm_xdl_cshuffle.hpp | 0 ...ce_contraction_multiple_d_xdl_cshuffle.hpp | 0 ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 0 ...ice_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp | 0 ...fle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 0 ...shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 0 ...onv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 0 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp | 0 ...ice_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp | 0 ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp | 2 +- ...device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp | 0 ...nd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp | 0 .../device/{ => impl}/device_elementwise.hpp | 0 ...vice_gemm_bias_add_reduce_xdl_cshuffle.hpp | 0 .../device_gemm_bias_e_permute_xdl.hpp | 0 .../gpu/device/{ => impl}/device_gemm_dl.hpp | 0 ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp | 0 .../device_gemm_multiple_d_xdl_cshuffle.hpp | 0 .../device_gemm_reduce_xdl_cshuffle.hpp | 0 .../gpu/device/{ => impl}/device_gemm_xdl.hpp | 0 .../{ => impl}/device_gemm_xdl_cshuffle.hpp | 0 .../device_gemm_xdl_layernorm_cshuffle.hpp | 0 .../{ => impl}/device_gemm_xdl_skip_b_lds.hpp | 0 .../device_gemm_xdl_splitk_c_shuffle.hpp | 0 ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 0 ...grouped_conv_fwd_multiple_d_multiple_r.hpp | 0 ...fwd_multiple_d_multiple_r_xdl_cshuffle.hpp | 8 +- 
...ouped_conv_fwd_multiple_d_xdl_cshuffle.hpp | 6 +- .../{ => impl}/device_grouped_gemm_xdl.hpp | 0 .../device_multiple_reduce_multiblock.hpp | 2 +- .../device_multiple_reduce_threadwise.hpp | 2 +- .../{ => impl}/device_normalization_impl.hpp | 2 +- .../device_pool2d_fwd_nhwc_nhwc.hpp | 0 .../{ => impl}/device_reduce_common.hpp | 0 .../{ => impl}/device_reduce_multiblock.hpp | 2 +- .../{ => impl}/device_reduce_threadwise.hpp | 2 +- .../gpu/device/impl/device_softmax_impl.hpp | 4 +- ...ce_sparse_embedding3_forward_layernorm.hpp | 0 .../gpu/device_elementwise_instance.hpp | 2 +- .../device_reduce_instance_blockwise.hpp | 2 +- ..._reduce_instance_multiblock_atomic_add.hpp | 2 +- .../device_reduce_instance_threadwise.hpp | 2 +- ...dl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp | 2 +- ...dl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp | 2 +- ...dl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp | 2 +- ...dl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp | 2 +- ...m_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp | 2 +- ...m_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp | 2 +- ...m_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp | 2 +- ...m_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp | 2 +- ...m_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp | 2 +- ...m_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp | 2 +- ...m_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp | 2 +- ...m_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp | 2 +- ...dl_int8_int8_int8_gkm_gkn_gmn_instance.cpp | 2 +- ...dl_int8_int8_int8_gkm_gnk_gmn_instance.cpp | 2 +- ...dl_int8_int8_int8_gmk_gkn_gmn_instance.cpp | 2 +- ...dl_int8_int8_int8_gmk_gnk_gmn_instance.cpp | 2 +- ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +- ...6_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 2 +- ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +- ...6_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 2 +- ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +- ...6_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp | 2 +- ...6_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp | 2 +- ...6_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp | 2 +- ...6_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp | 2 +- ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 2 +- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 2 +- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 2 +- ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 2 +- ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 2 +- ...bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp | 2 +- ..._bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp | 2 +- ..._bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp | 2 +- ...bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp | 2 +- ...d_weight_xdl_nwc_kxc_nwk_bf16_instance.cpp | 2 +- ...wd_weight_xdl_nwc_kxc_nwk_f16_instance.cpp | 2 +- ...wd_weight_xdl_nwc_kxc_nwk_f32_instance.cpp | 2 +- ..._data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 4 +- ...d_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 4 +- ...d_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 4 +- ..._data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 4 +- ...eight_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 2 +- ...weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 4 +- ...weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 4 +- ..._c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp | 2 +- ...d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 2 +- ...2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 2 +- ...2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 2 +- ...d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 2 +- 
..._bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp | 2 +- ...s_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp | 2 +- ...ta_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 2 +- ...ata_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 2 +- ...ata_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 2 +- ...ta_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp | 2 +- ...ht_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 2 +- ...ght_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 2 +- ...ght_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 2 +- .../elementwise/device_normalize_instance.cpp | 2 +- ..._gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- ..._gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ..._gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ..._gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +- ..._gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp | 2 +- ..._gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp | 2 +- ..._gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp | 2 +- ..._gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp | 2 +- ...ice_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp | 2 +- ...ice_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp | 2 +- ...ice_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp | 2 +- ...ice_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp | 2 +- ..._2_stage_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +- ...uffle_bf16_bf16_bf16_km_kn_mn_instance.cpp | 2 +- ...uffle_bf16_bf16_bf16_km_nk_mn_instance.cpp | 2 +- ...uffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp | 2 +- ...uffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp | 2 +- ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- ..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_km_kn_mn_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_km_nk_mn_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_mk_kn_mn_instance.cpp | 2 +- ..._shuffle_f32_f32_f32_mk_nk_mn_instance.cpp | 2 +- ...l_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp | 2 +- ...l_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp | 2 +- ...l_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp | 2 +- ...l_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp | 2 +- ...16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp | 2 +- ...16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp | 2 +- ...16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp | 2 +- ...16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 2 +- ...e_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp | 2 +- ...e_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp | 2 +- ...e_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp | 2 +- ...e_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 2 +- 
...l_splitk_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- ...l_splitk_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_km_kn_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_km_nk_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_mk_kn_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_mk_nk_mn_instance.cpp | 2 +- ...d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp | 2 +- ...1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp | 2 +- ...1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp | 2 +- ...d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp | 2 +- ...wd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 2 +- ...fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 2 +- ...fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 2 +- ...wd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp | 2 +- ...fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 2 +- ...xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 2 +- ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 2 +- ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 2 +- ...xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +- .../device_normalization_f16_instance.cpp | 2 +- .../device_normalization_f32_instance.cpp | 2 +- ...asking_scale_softmax_gemm_permute_impl.hpp | 2 +- profiler/include/profile_layernorm_impl.hpp | 2 - .../test_batched_gemm_gemm_util.hpp | 2 +- ...asking_scale_softmax_gemm_permute_util.hpp | 2 +- .../test_batched_gemm_softmax_gemm_util.hpp | 2 +- test/convnd_bwd_data/convnd_bwd_data.cpp | 272 ++----- test/convnd_bwd_weight/convnd_bwd_weight.cpp | 237 ++---- test/convnd_fwd/convnd_fwd.cpp | 273 ++----- test/normalization/test_layernorm2d_util.hpp | 2 +- 305 files changed, 1152 insertions(+), 883 deletions(-) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_contraction_multiple_d_xdl_cshuffle.hpp (100%) create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_gemm_gemm_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_gemm_multi_d_xdl.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_gemm_reduce_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_batched_gemm_xdl.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_cgemm_4gemm_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_contraction_multiple_d_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp (100%) rename 
include/ck/tensor_operation/gpu/device/{ => impl}/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_elementwise.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_bias_add_reduce_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_bias_e_permute_xdl.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_dl.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_multiple_d_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_reduce_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_xdl.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_xdl_layernorm_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_xdl_skip_b_lds.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_gemm_xdl_splitk_c_shuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_grouped_conv_fwd_multiple_d_multiple_r.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_grouped_gemm_xdl.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_multiple_reduce_multiblock.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_multiple_reduce_threadwise.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_normalization_impl.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_pool2d_fwd_nhwc_nhwc.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_reduce_common.hpp (100%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_reduce_multiblock.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_reduce_threadwise.hpp (99%) rename include/ck/tensor_operation/gpu/device/{ => impl}/device_sparse_embedding3_forward_layernorm.hpp (100%) diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp index 9b157f29a..6c259407d 100644 --- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp +++ 
b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp" diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp index 03be1880f..cf585a8c5 100644 --- a/example/01_gemm/gemm_dl_fp16.cpp +++ b/example/01_gemm/gemm_dl_fp16.cpp @@ -3,7 +3,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" using ADataType = ck::half_t; using BDataType = ck::half_t; diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp index b21701140..93f085cde 100644 --- a/example/01_gemm/gemm_dl_fp32.cpp +++ b/example/01_gemm/gemm_dl_fp32.cpp @@ -3,7 +3,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" using ADataType = float; using BDataType = float; diff --git a/example/01_gemm/gemm_dl_int4.cpp b/example/01_gemm/gemm_dl_int4.cpp index ea45f2166..e392c490f 100644 --- a/example/01_gemm/gemm_dl_int4.cpp +++ b/example/01_gemm/gemm_dl_int4.cpp @@ -7,7 +7,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" using ADataType = ck::int4_t; using BDataType = ck::int4_t; diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp index a867cf3b6..be9e38771 100644 --- a/example/01_gemm/gemm_dl_int8.cpp +++ b/example/01_gemm/gemm_dl_int8.cpp @@ -3,7 +3,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" using ADataType = int8_t; using BDataType = int8_t; diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp index 6b9dda081..9aaae6ade 100644 --- a/example/01_gemm/gemm_xdl_bf16.cpp +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -3,7 +3,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" using ADataType = ck::bhalf_t; using BDataType = ck::bhalf_t; diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index 1d48e8363..488babb75 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -3,8 +3,8 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" using ADataType = ck::half_t; using BDataType = ck::half_t; diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index 275a9a214..99253b743 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -3,7 +3,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" using ADataType = double; using BDataType = double; diff --git a/example/01_gemm/gemm_xdl_int4.cpp b/example/01_gemm/gemm_xdl_int4.cpp index d26806021..7f1283a47 100644 --- a/example/01_gemm/gemm_xdl_int4.cpp +++ b/example/01_gemm/gemm_xdl_int4.cpp @@ -7,7 +7,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" using ADataType = ck::int4_t; using BDataType = ck::int4_t; diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp index 5fd269471..e67594c5b 100644 --- a/example/01_gemm/gemm_xdl_int8.cpp +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -3,7 +3,7 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" using ADataType = int8_t; using BDataType = int8_t; diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp index 5cb7f5e4c..8ee98156e 100644 --- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp +++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp @@ -3,8 +3,8 @@ #include "common.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp" using F16 = ck::half_t; using F32 = float; diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp index 081f2b514..d1b8ca10a 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp index ae5e32341..5d1e9e809 100644 --- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp +++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/example/04_gemm_add_add_fastgelu/common.hpp index 016db614e..3f9375e09 100644 --- a/example/04_gemm_add_add_fastgelu/common.hpp +++ b/example/04_gemm_add_add_fastgelu/common.hpp @@ -12,7 +12,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/utility/data_type.hpp" diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp index eeb039827..d55d31549 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp @@ -3,7 +3,7 @@ #include "convnd_fwd_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp index f7ee4707f..d84afba64 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp @@ -3,7 +3,7 @@ #include "convnd_fwd_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp index 010304fcd..f5acc540c 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp @@ -3,7 +3,7 @@ #include "convnd_fwd_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index 0804fdc32..8d697976a 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -3,7 +3,7 @@ #include "convnd_fwd_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp index 259b0a2b0..99f7f2565 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp @@ -3,7 +3,7 @@ #include "convnd_fwd_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp index 8ff683d33..642315fc6 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp @@ -12,7 +12,7 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp index ef5ec9948..1d2769ea9 100644 --- a/example/12_reduce/reduce_blockwise_impl.hpp +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index df58cc276..a84856c33 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -11,7 +11,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp index c2fa8da91..b67854673 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp index 32b66934a..ccb20aa1e 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -9,7 +9,7 @@ #include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_functions_accumulate.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" -#include "ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp index d3afa3865..79838d1b2 100644 --- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp +++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp index 427e82b40..15d7d48fd 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 13bb1c540..d1c265ccd 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp index 7d1a102d1..78e2167ea 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp index 7355641d9..2113cf943 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp index c96ff76bf..0c35c1b6a 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" #include 
"ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp index f7911645a..6d57cef1e 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp index c265c7a78..bc621a4b8 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp @@ -5,7 +5,7 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" // DataType diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp index b11f1c7b2..c2feffeb8 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp @@ -4,7 +4,7 @@ #include "gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" // DataType diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp index 20b2ba3f4..363390add 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp @@ -4,7 +4,7 @@ #include "gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" // DataType diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp index e4894bd2b..de6b7eb48 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp @@ -4,7 +4,7 @@ #include "gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include 
"ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" // DataType diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp index 22cf27060..9666fc662 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp @@ -4,7 +4,7 @@ #include "gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" using ADataType = INT4; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp index a71b9a86a..00e0b767a 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp @@ -4,7 +4,7 @@ #include "gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" using ADataType = INT8; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp index e1bdaab12..652c0e6ea 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp @@ -4,7 +4,7 @@ #include "gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" // DataType diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp index dfcd2c56c..7eee24fed 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp @@ -4,7 +4,7 @@ #include "gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" // DataType diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp index 63aa362c8..c250b9969 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp @@ -4,7 +4,7 @@ #include 
"gemm_reduce_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" // DataType diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp index 392e961b0..c4f2c1f02 100644 --- a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp +++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp @@ -3,7 +3,7 @@ #include "convnd_bwd_data_common.hpp" -#include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp index fb019faa4..3488a5336 100644 --- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp +++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_operator.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp index 50604da18..b84d32017 100644 --- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp index 9f2e1e785..041871bf5 100644 --- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp +++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp index d123798fe..fb218d235 100644 --- a/example/19_binary_elementwise/elementwise_add_1d.cpp +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -5,7 +5,7 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include 
"ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp index 4c7452694..d4b9f90fa 100644 --- a/example/19_binary_elementwise/elementwise_add_4d.cpp +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_bf16.cpp b/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_bf16.cpp index d9409d7c4..0f1dee993 100644 --- a/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_bf16.cpp +++ b/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_bf16.cpp @@ -3,7 +3,7 @@ #include "convnd_bwd_weight_common.hpp" -#include "ck/tensor_operation/gpu/device/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp" using InDataType = ck::bhalf_t; // bf16 kernel use fp32 atomic add to accumulate Weight tensor into global memory diff --git a/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_fp16.cpp b/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_fp16.cpp index 39476eb04..b825192eb 100644 --- a/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_fp16.cpp +++ b/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_fp16.cpp @@ -3,7 +3,7 @@ #include "convnd_bwd_weight_common.hpp" -#include "ck/tensor_operation/gpu/device/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_weight_nwc_kxc_nwk_xdl_cshuffle.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp index d4fbcfb99..8d9f87d7e 100644 --- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp @@ -9,8 +9,8 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp index 0e00a0da6..31231bc8a 100644 --- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp @@ -9,8 +9,8 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp index a6d15b00a..56d4472bc 100644 --- a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp @@ -11,7 +11,7 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_layernorm_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/utility/reduction_operator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp" diff --git a/example/22_cgemm/cgemm_xdl_bf16.cpp b/example/22_cgemm/cgemm_xdl_bf16.cpp index 4369be8a3..92ed90ce4 100644 --- a/example/22_cgemm/cgemm_xdl_bf16.cpp +++ b/example/22_cgemm/cgemm_xdl_bf16.cpp @@ -8,7 +8,7 @@ #include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" using ADataType = BF16; diff --git a/example/22_cgemm/cgemm_xdl_fp16.cpp b/example/22_cgemm/cgemm_xdl_fp16.cpp index a73d41e82..11373736e 100644 --- a/example/22_cgemm/cgemm_xdl_fp16.cpp +++ b/example/22_cgemm/cgemm_xdl_fp16.cpp @@ -6,7 +6,7 @@ #include "cgemm_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" using ADataType = F16; diff --git a/example/22_cgemm/cgemm_xdl_fp32.cpp b/example/22_cgemm/cgemm_xdl_fp32.cpp index ac32ba768..0f45c18c4 100644 --- a/example/22_cgemm/cgemm_xdl_fp32.cpp +++ b/example/22_cgemm/cgemm_xdl_fp32.cpp @@ -8,7 +8,7 @@ #include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" using ADataType = F32; diff --git a/example/22_cgemm/cgemm_xdl_int4.cpp b/example/22_cgemm/cgemm_xdl_int4.cpp index cf3cbbc2a..c26a83baa 100644 --- a/example/22_cgemm/cgemm_xdl_int4.cpp +++ b/example/22_cgemm/cgemm_xdl_int4.cpp @@ -8,7 +8,7 @@ #include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" using ADataType = INT4; diff --git a/example/22_cgemm/cgemm_xdl_int8.cpp b/example/22_cgemm/cgemm_xdl_int8.cpp index e1389ac92..2f2418986 100644 --- a/example/22_cgemm/cgemm_xdl_int8.cpp +++ b/example/22_cgemm/cgemm_xdl_int8.cpp @@ -8,7 +8,7 @@ #include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" using ADataType = INT8; diff --git a/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp b/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp index 42beb0e92..c684c13d0 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp index f9dc58108..d1985f9af 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp index 304cd14db..a92a04dbe 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp index 95e715efa..5e82cfe32 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include 
"ck/library/utility/check_err.hpp" diff --git a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp index cc4835507..ad22227af 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp index 2fec602f9..9cd34bfc1 100644 --- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp +++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp index 66c9bda21..06553fad7 100644 --- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp +++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp index 070703b4f..c73f5a51e 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp32.cpp index 0c8061352..5353d8a9b 100644 --- a/example/26_contraction/contraction_scale_xdl_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" #include 
"ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/27_layernorm/layernorm_blockwise.cpp b/example/27_layernorm/layernorm_blockwise.cpp index e8a1af9c2..54c4eaf74 100644 --- a/example/27_layernorm/layernorm_blockwise.cpp +++ b/example/27_layernorm/layernorm_blockwise.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp index 9505b6d21..e1fa966a2 100644 --- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp index 4f723695d..ef7f5b029 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp index bd5b48f88..984f28c84 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp @@ -3,7 +3,7 @@ #include "grouped_convnd_fwd_bias_relu_add_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp index 36997c33c..d5a05a2cf 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp @@ -3,7 
+3,7 @@ #include "grouped_convnd_fwd_bias_relu_add_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp index 9b2374de2..2e5dbb594 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp @@ -3,7 +3,7 @@ #include "grouped_convnd_fwd_bias_relu_add_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp index be5b79124..9c96015cd 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp @@ -3,7 +3,7 @@ #include "grouped_convnd_fwd_bias_relu_add_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp index 1f3434694..3a366cece 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp +++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp @@ -3,7 +3,7 @@ #include "grouped_convnd_fwd_bias_relu_add_common.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp index abe6fd33a..398895091 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp @@ -16,7 +16,7 @@ Gemm + Gemm fused operation. 
Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp index 7046d1b27..2f0d4e686 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp @@ -16,7 +16,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp index b2ad93e18..6ad74889d 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp @@ -16,7 +16,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp index 09880cb17..29faf13e1 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp @@ -20,7 +20,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp index 27d87215c..153257543 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp @@ -16,7 +16,7 @@ Gemm + Gemm fused operation. 
Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp index b77a6996c..20294bccf 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -17,7 +17,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp index 570907873..8b2daec65 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -17,7 +17,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp index 3e544cc6b..327875e28 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp @@ -16,7 +16,7 @@ Gemm + Softmax + Gemm fused operation. 
Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/33_multiple_reduce/dual_reduce_multiblock.cpp b/example/33_multiple_reduce/dual_reduce_multiblock.cpp index 638934ec0..9360599ed 100644 --- a/example/33_multiple_reduce/dual_reduce_multiblock.cpp +++ b/example/33_multiple_reduce/dual_reduce_multiblock.cpp @@ -13,7 +13,7 @@ #include "ck/utility/data_type.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" -#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "dual_reduce_common.hpp" diff --git a/example/33_multiple_reduce/dual_reduce_threadwise.cpp b/example/33_multiple_reduce/dual_reduce_threadwise.cpp index 51b93ccaa..56255839e 100644 --- a/example/33_multiple_reduce/dual_reduce_threadwise.cpp +++ b/example/33_multiple_reduce/dual_reduce_threadwise.cpp @@ -13,7 +13,7 @@ #include "ck/utility/data_type.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" -#include "ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "dual_reduce_common.hpp" diff --git a/example/34_batchnorm/batchnorm_forward_impl.hpp b/example/34_batchnorm/batchnorm_forward_impl.hpp index c383c2a63..6fb7987e9 100644 --- a/example/34_batchnorm/batchnorm_forward_impl.hpp +++ b/example/34_batchnorm/batchnorm_forward_impl.hpp @@ -9,8 +9,8 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_operator.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "batchnorm_common.hpp" diff --git a/example/34_batchnorm/batchnorm_infer_impl.hpp b/example/34_batchnorm/batchnorm_infer_impl.hpp index d1164d0ff..23c4978d7 100644 --- a/example/34_batchnorm/batchnorm_infer_impl.hpp +++ b/example/34_batchnorm/batchnorm_infer_impl.hpp @@ -10,7 +10,7 @@ #include "ck/utility/sequence.hpp" #include "ck/utility/tuple.hpp" #include "ck/utility/reduction_operator.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "batchnorm_common.hpp" diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp index 484a4494b..7191ecf50 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp index a1c43d038..efdb315b4 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp index 01093461c..bc2e3d1d5 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp index d2392faf5..4eb278246 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp index d2f51db2c..eefdbca6b 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp index c6c12108b..69d5c587e 100644 --- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp +++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp @@ -9,7 +9,7 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_sparse_embedding3_forward_layernorm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_sparse_embedding3_forward_layernorm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp 
b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp index 8bf9103e6..e7efa04d2 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -12,7 +12,7 @@ Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1 #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp index 3545cc0ef..205916ff4 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp index f329e28bf..3bfa4c50e 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp index 45f909e01..ab0ddf075 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp index f327ea4b3..7a46285c5 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp @@ -12,7 +12,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp index 9ee26ded7..62287ea60 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp index e0924ec3a..8261b8d6a 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp +++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/library/utility/fill.hpp" diff --git a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp new file mode 100644 index 000000000..01f5e17d9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp @@ -0,0 +1,683 @@ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. 
If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for + * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of + * pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemmCPermute and GroupedGemm (and the corresponding GEMM fusion). + * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_e_permute_xdl(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + EDataType* __restrict__ p_e_grid, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx))); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + ck::Tuple<>{}, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ck::Tuple<>{}, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_etile_map; +#endif +} + +template +struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute +{ + using 
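// [Editorial aside, not part of the patch] The ComputePtrOffsetOfBatch concept described in the
// comment above reduces to a functor that maps a batch index to element offsets for A, B and E.
// A minimal sketch for evenly strided batches, with hypothetical member names, could look like the
// following; the ComputePtrOffsetOfStridedBatch defined further below follows the same contract but
// routes the E offset through a (G0, G1, M, N) descriptor so the output can be permuted:
//
//     struct EvenlyStridedBatchOffset
//     {
//         long_index_t GetAPtrOffset(index_t g_idx) const { return g_idx * static_cast<long_index_t>(batch_stride_a_); }
//         long_index_t GetBPtrOffset(index_t g_idx) const { return g_idx * static_cast<long_index_t>(batch_stride_b_); }
//         long_index_t GetCPtrOffset(index_t g_idx) const { return g_idx * static_cast<long_index_t>(batch_stride_e_); }
//
//         index_t batch_stride_a_;
//         index_t batch_stride_b_;
//         index_t batch_stride_e_;
//     };
//
// Returning long_index_t keeps the per-batch offsets in 64-bit arithmetic, which is what the "2GB"
// remark above refers to.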
DeviceOp = DeviceBatchedGemmEPermuteXdl; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + static auto + MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t stride_M, index_t stride_N) + { + const auto e_grid_desc_mraw_nraw = + make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), make_tuple(stride_M, stride_N)); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + static auto MakeEGridDescriptor_G0_G1_M_N(index_t G0, + index_t G1, + index_t MRaw, + index_t NRaw, + index_t stride_G0, + index_t stride_G1, + index_t stride_M, + index_t stride_N) + { + const auto e_grid_desc_g0_g1_mraw_nraw = [&]() { + return make_naive_tensor_descriptor( + make_tuple(G0, G1, MRaw, NRaw), + make_tuple(stride_G0, stride_G1, stride_M, stride_N)); + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor( + e_grid_desc_g0_g1_mraw_nraw, + make_tuple(make_pass_through_transform(G0), + make_pass_through_transform(G1), + make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + e_grid_desc_g0_g1_mraw_nraw, + make_tuple(make_pass_through_transform(G0), + make_pass_through_transform(G1), + make_right_pad_transform(MRaw, MPad), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + e_grid_desc_g0_g1_mraw_nraw, + make_tuple(make_pass_through_transform(G0), + make_pass_through_transform(G1), + make_pass_through_transform(MRaw), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, 
Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + } + else + { + // not pad M or N + return e_grid_desc_g0_g1_mraw_nraw; + } + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)); + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1, 1)); + using EGridDesc_G0_G1_M_N = decltype(MakeEGridDescriptor_G0_G1_M_N(1, 1, 1, 1, 1, 1, 1, 1)); + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t Batchstride_A, + index_t Batchstride_B, + EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n) + : Batchstride_A_(Batchstride_A), + Batchstride_B_(Batchstride_B), + e_grid_desc_g0_g1_m_n_(e_grid_desc_g0_g1_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(Batchstride_A_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(Batchstride_B_); + } + + __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const + { + const index_t G1 = e_grid_desc_g0_g1_m_n_.GetLength(I1); + index_t b0 = g_idx / G1; + index_t b1 = g_idx - b0 * G1; // g_idx % G1 + return e_grid_desc_g0_g1_m_n_.CalculateOffset(make_multi_index(b0, b1, 0, 0)); + } + + private: + index_t Batchstride_A_; + index_t Batchstride_B_; + EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n_; + }; + + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + ck::Tuple<>, // DsDataType, + EDataType, // EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_M_K, + BGridDesc_N_K, + Tuple<>, + EGridDesc_M_N, + NumPrefetch, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype( + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{})); + using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + EDataType* p_e_grid, + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t batch_stride_A, + index_t batch_stride_B, + BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, + index_t BatchCount, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation 
cde_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_e_grid_{p_e_grid}, + BatchCount_(BatchCount), + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(M, K, stride_A)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(K, N, stride_B)}, + e_grid_desc_m_n_{ + DeviceOp::MakeEGridDescriptor_M_N(batched_gemm_e_permute_desc.M_, + batched_gemm_e_permute_desc.N_, + batched_gemm_e_permute_desc.stride_M_, + batched_gemm_e_permute_desc.stride_N_)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + e_grid_desc_mblock_mperblock_nblock_nperblock{}, + e_grid_desc_g0_g1_m_n_{ + DeviceOp::MakeEGridDescriptor_G0_G1_M_N(batched_gemm_e_permute_desc.G0_, + batched_gemm_e_permute_desc.G1_, + batched_gemm_e_permute_desc.M_, + batched_gemm_e_permute_desc.N_, + batched_gemm_e_permute_desc.stride_G0_, + batched_gemm_e_permute_desc.stride_G1_, + batched_gemm_e_permute_desc.stride_M_, + batched_gemm_e_permute_desc.stride_N_)}, + compute_ptr_offset_of_batch_{batch_stride_A, batch_stride_B, e_grid_desc_g0_g1_m_n_}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ck::Tuple<>{}, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + std::cout << "C[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + EDataType* p_e_grid_; + + // batch count + index_t BatchCount_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock; + EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n_; + + // for calculating Batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + ck::Tuple<>{}, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseBatchedGemmCPermute_km_kn_m0m1n0n1_xdlops_v2r3 has invalid " + "setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.BatchCount_; + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_batched_gemm_e_permute_xdl< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + EDataType, + remove_reference_t, + remove_reference_t, + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + ComputePtrOffsetOfStridedBatch, + remove_reference_t, + has_main_k_block_loop_>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_e_grid_, + arg.BatchCount_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + ck::Tuple<>{}, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + EDataType* p_e, + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t batch_stride_A, + index_t batch_stride_B, + BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, + index_t BatchCount, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_e, + M, + N, + K, + stride_A, + stride_B, + batch_stride_A, + batch_stride_B, + batched_gemm_e_permute_desc, + BatchCount, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t batch_stride_A, + index_t batch_stride_B, + BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, + index_t BatchCount, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_e), + M, + N, + K, + stride_A, + stride_B, + batch_stride_A, + batch_stride_B, + batched_gemm_e_permute_desc, + BatchCount, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic 
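// [Editorial aside, not part of the patch] Host-side use of this device op follows the usual CK
// pattern of MakeArgument / IsSupportedArgument / MakeInvoker / Run. A hedged sketch, assuming an
// instantiated alias DeviceOpInstance for this struct, valid device pointers and a filled
// BatchedGemmEPermuteDesc (all names below are placeholders, not taken from the patch):
//
//     auto op       = DeviceOpInstance{};
//     auto argument = op.MakeArgument(p_a, p_b, p_e,
//                                     M, N, K,
//                                     stride_A, stride_B,
//                                     batch_stride_A, batch_stride_B,
//                                     batched_gemm_e_permute_desc, batch_count,
//                                     a_element_op, b_element_op, cde_element_op);
//
//     if(!op.IsSupportedArgument(argument))
//     {
//         // fall back to another instance or report an unsupported problem size
//     }
//
//     auto invoker   = op.MakeInvoker();
//     float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});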
+ std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmEPermuteXdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp similarity index 99% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp index af5b88065..c2c765208 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp @@ -38,9 +38,9 @@ namespace device { * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link - * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link - * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of - * pointer offset into \p ComputePtrOffsetOfStridedBatch. + * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for + * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the + * computing of pointer offset into \p ComputePtrOffsetOfStridedBatch. * * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. 
* Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index 44d392d99..d37c02b81 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -9,10 +9,10 @@ #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp similarity index 100% rename from include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp similarity index 99% rename from include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp index b48cfac0d..f950538d0 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -22,7 +22,7 @@ namespace tensor_operation { namespace device { /* - * \see \link device_batched_gemm_xdl.hpp kernel_batched_gemm_xdlops_v2r3() \endlink. + * \see \link impl/device_batched_gemm_xdl.hpp kernel_batched_gemm_xdlops_v2r3() \endlink. */ template #include "ck/ck.hpp" - #include "ck/library/tensor_operation_instance/gpu/normalization.hpp" - #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp index f8dec4fc8..d7fbc37f0 100644 --- a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp +++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp @@ -5,7 +5,7 @@ #include #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "profiler/include/profile_batched_gemm_gemm_impl.hpp" using ck::tensor_operation::device::GemmSpecialization; diff --git a/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_util.hpp b/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_util.hpp index ba27dd7e6..cd5d6389b 100644 --- a/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_util.hpp +++ b/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_util.hpp @@ -5,7 +5,7 @@ #include #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "profiler/include/profile_batched_gemm_masking_scale_softmax_gemm_permute_impl.hpp" using ck::tensor_operation::device::GemmSpecialization; diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp index ae098c541..eb7fb24b2 100644 --- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp +++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp @@ -5,7 +5,7 @@ #include #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" #include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp" using ck::tensor_operation::device::GemmSpecialization; diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp 
b/test/convnd_bwd_data/convnd_bwd_data.cpp index cc555faf6..c31e399ef 100644 --- a/test/convnd_bwd_data/convnd_bwd_data.cpp +++ b/test/convnd_bwd_data/convnd_bwd_data.cpp @@ -5,237 +5,89 @@ #include #include #include +#include #include #include "profiler/include/profile_conv_bwd_data_impl.hpp" +template class TestConvndBwdData : public ::testing::Test { protected: + using DataType = std::tuple_element_t<0, Tuple>; std::vector conv_params; -}; -// 1d -TEST_F(TestConvndBwdData, Conv1dBwdData) -{ - conv_params.clear(); - conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); - conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); - conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); - - for(auto& param : conv_params) + template + void Run() { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_bwd_data_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_bwd_data_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_bwd_data_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); + for(auto& param : conv_params) + { + bool pass; + EXPECT_FALSE(conv_params.empty()); + pass = ck::profiler::profile_conv_bwd_data_impl< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + DataType, + DataType, + DataType>(true, // do_verification + 1, // init_method integer value + false, // do_log + false, // time_kernel + param); + EXPECT_TRUE(pass); + } + } +}; - // int8 - pass = ck::profiler::profile_conv_bwd_data_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - int8_t, - int8_t, - int8_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; +TYPED_TEST_SUITE(TestConvndBwdData, KernelTypes); - EXPECT_TRUE(pass); - } +// 1d +TYPED_TEST(TestConvndBwdData, Conv1dBwdData) +{ + this->conv_params.clear(); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); + this->template Run<1>(); } // 2d -TEST_F(TestConvndBwdData, Conv2dBwdData) +TYPED_TEST(TestConvndBwdData, Conv2dBwdData) { - conv_params.clear(); - conv_params.push_back({2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); - conv_params.push_back({2, 1, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - conv_params.push_back({2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); - - for(auto& param : conv_params) - { - bool 
pass; - - // fp32 - pass = ck::profiler::profile_conv_bwd_data_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_bwd_data_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_bwd_data_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // int8 - pass = ck::profiler::profile_conv_bwd_data_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - int8_t, - int8_t, - int8_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - } + this->conv_params.clear(); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->template Run<2>(); } // 3d -TEST_F(TestConvndBwdData, Conv3dBwdData) +TYPED_TEST(TestConvndBwdData, Conv3dBwdData) { - conv_params.clear(); - conv_params.push_back( + this->conv_params.clear(); + this->conv_params.push_back( {3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - conv_params.push_back( + this->conv_params.push_back( {3, 1, 128, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); - conv_params.push_back( + this->conv_params.push_back( {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - - for(auto& param : conv_params) - { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_bwd_data_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_bwd_data_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_bwd_data_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // int8 - pass = ck::profiler::profile_conv_bwd_data_impl<3, - ck::tensor_layout::convolution::NDHWC, - 
ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - int8_t, - int8_t, - int8_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - } + this->template Run<3>(); } diff --git a/test/convnd_bwd_weight/convnd_bwd_weight.cpp b/test/convnd_bwd_weight/convnd_bwd_weight.cpp index af27282f1..19fc66a90 100644 --- a/test/convnd_bwd_weight/convnd_bwd_weight.cpp +++ b/test/convnd_bwd_weight/convnd_bwd_weight.cpp @@ -5,201 +5,86 @@ #include #include #include +#include #include #include "profiler/include/profile_conv_bwd_weight_impl.hpp" +template class TestConvndBwdWeight : public ::testing::Test { protected: + using DataType = std::tuple_element_t<0, Tuple>; std::vector conv_params; -}; - -// 1d -TEST_F(TestConvndBwdWeight, Conv1dBwdWeight) -{ - conv_params.clear(); - conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); - conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); - conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); + ck::index_t split_k{2}; - for(auto& param : conv_params) + template + void Run() { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_bwd_weight_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_bwd_weight_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); + for(auto& param : conv_params) + { + bool pass; + EXPECT_FALSE(conv_params.empty()); + pass = ck::profiler::profile_conv_bwd_weight_impl< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + DataType, + DataType, + DataType>(true, // do_verification + 1, // init_method integer value + false, // do_log + false, // time_kernel + param, + split_k); + EXPECT_TRUE(pass); + } + } +}; - // bf16 - pass = ck::profiler::profile_conv_bwd_weight_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); +using KernelTypes = + ::testing::Types, std::tuple, std::tuple>; +TYPED_TEST_SUITE(TestConvndBwdWeight, KernelTypes); - EXPECT_TRUE(pass); - } +TYPED_TEST(TestConvndBwdWeight, Test1D) +{ + this->conv_params.clear(); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); + this->template Run<1>(); } -// 2d -TEST_F(TestConvndBwdWeight, Conv2dBwdWeight) +TYPED_TEST(TestConvndBwdWeight, Test2D) { - conv_params.clear(); - conv_params.push_back({2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); - conv_params.push_back({2, 1, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - conv_params.push_back({2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); - - 
for(auto& param : conv_params) - { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_bwd_weight_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_bwd_weight_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_bwd_weight_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); - } + this->conv_params.clear(); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 1, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->template Run<2>(); } -// 3d -TEST_F(TestConvndBwdWeight, Conv3dBwdWeight) +TYPED_TEST(TestConvndBwdWeight, Test3D) { - conv_params.clear(); - conv_params.push_back( + this->conv_params.clear(); + this->conv_params.push_back( {3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - conv_params.push_back( + this->conv_params.push_back( {3, 1, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); - conv_params.push_back( + this->conv_params.push_back( {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - - for(auto& param : conv_params) - { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_bwd_weight_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_bwd_weight_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_bwd_weight_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param, - 2); - - EXPECT_TRUE(pass); - } + this->template Run<3>(); } diff --git a/test/convnd_fwd/convnd_fwd.cpp b/test/convnd_fwd/convnd_fwd.cpp index 5d4aae295..7a9782ebc 100644 --- a/test/convnd_fwd/convnd_fwd.cpp +++ b/test/convnd_fwd/convnd_fwd.cpp @@ -5,237 +5,88 @@ #include #include #include +#include #include #include "profiler/include/profile_conv_fwd_impl.hpp" +template class TestConvndFwd : 
public ::testing::Test { protected: + using DataType = std::tuple_element_t<0, Tuple>; std::vector conv_params; -}; -// 1d -TEST_F(TestConvndFwd, Conv1dFwd) -{ - conv_params.clear(); - conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); - conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); - conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); - - for(auto& param : conv_params) + template + void Run() { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_fwd_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_fwd_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_fwd_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); + for(auto& param : conv_params) + { + bool pass; + EXPECT_FALSE(conv_params.empty()); + pass = ck::profiler::profile_conv_fwd_impl< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + DataType, + DataType, + DataType>(true, // do_verification + 1, // init_method integer value + false, // do_log + false, // time_kernel + param); + EXPECT_TRUE(pass); + } + } +}; - // int8 - pass = ck::profiler::profile_conv_fwd_impl<1, - ck::tensor_layout::convolution::NWC, - ck::tensor_layout::convolution::KXC, - ck::tensor_layout::convolution::NWK, - int8_t, - int8_t, - int8_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; +TYPED_TEST_SUITE(TestConvndFwd, KernelTypes); - EXPECT_TRUE(pass); - } +// 1d +TYPED_TEST(TestConvndFwd, Conv1dFwd) +{ + this->conv_params.clear(); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); + this->template Run<1>(); } // 2d -TEST_F(TestConvndFwd, Conv2dFwd) +TYPED_TEST(TestConvndFwd, Conv2dFwd) { - conv_params.clear(); - conv_params.push_back({2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); - conv_params.push_back({2, 1, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - conv_params.push_back({2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); - - for(auto& param : conv_params) - { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_fwd_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_fwd_impl<2, - 
ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_fwd_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // int8 - pass = ck::profiler::profile_conv_fwd_impl<2, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK, - int8_t, - int8_t, - int8_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - } + this->conv_params.clear(); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->template Run<2>(); } - // 3d -TEST_F(TestConvndFwd, Conv3dFwd) +TYPED_TEST(TestConvndFwd, Conv3dFwd) { - conv_params.clear(); - conv_params.push_back( + this->conv_params.clear(); + this->conv_params.push_back( {3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - conv_params.push_back( + this->conv_params.push_back( {3, 1, 128, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); - conv_params.push_back( + this->conv_params.push_back( {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - - for(auto& param : conv_params) - { - bool pass; - - // fp32 - pass = ck::profiler::profile_conv_fwd_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - float, - float, - float>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // fp16 - pass = ck::profiler::profile_conv_fwd_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - ck::half_t, - ck::half_t, - ck::half_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // bf16 - pass = ck::profiler::profile_conv_fwd_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - ck::bhalf_t, - ck::bhalf_t, - ck::bhalf_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - - // int8 - pass = ck::profiler::profile_conv_fwd_impl<3, - ck::tensor_layout::convolution::NDHWC, - ck::tensor_layout::convolution::KZYXC, - ck::tensor_layout::convolution::NDHWK, - int8_t, - int8_t, - int8_t>(true, // do_verification - 1, // init_method - false, // do_log - false, // time_kernel - param); - - EXPECT_TRUE(pass); - } + this->template Run<3>(); } diff --git a/test/normalization/test_layernorm2d_util.hpp b/test/normalization/test_layernorm2d_util.hpp index 3998d08b0..c1d4d0f54 100644 --- a/test/normalization/test_layernorm2d_util.hpp +++ 
b/test/normalization/test_layernorm2d_util.hpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/utility/number.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/host_tensor.hpp" -- GitLab From cee440fe4c006021a3b4c875bc416e68525a8fd9 Mon Sep 17 00:00:00 2001 From: arai713 <67439843+arai713@users.noreply.github.com> Date: Mon, 17 Oct 2022 12:59:34 -0700 Subject: [PATCH 02/95] adding tensor_permutation example folder (#389) * adding tensor_permutation example folder * fixed formatting * adding tensor_permutation example folder * fixed formatting * changed deviceelementwise parameters for outscalar * removed .swo file * updated folder/file name * changed function call in verification for better consistency with hostelementwist parameters * formatted again * fixed shape in verification function call * changed verification function call, added definition for nhwc * added elementwise permute example * updated CMakeLists file in folder * Delete CmakeLists.txt * Delete tensor_permute.cpp * first version of 2d gridwise_elementwise kernel * temporary fix for stride problem * formatting * format * changed directory name * Delete gridwise_elementwise_2d.hpp * Delete CMakeLists.txt * Delete extra file * delete extra file * got rid of extraneous code * added 2d device elementwise file * deleted accidently added file * update * stride values generalized with equations * updated stride for output matrix * Update CMakeLists.txt * removed extraneous commented code * removed shape_nchw vector, replaced with GetLength for each dimension * changed vector load in kernel call * removed extra space in CMake --- example/38_elementwise_permute/CMakeLists.txt | 1 + .../elementwise_permute_4D_fp16.cpp | 105 ++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 example/38_elementwise_permute/CMakeLists.txt create mode 100644 example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp diff --git a/example/38_elementwise_permute/CMakeLists.txt b/example/38_elementwise_permute/CMakeLists.txt new file mode 100644 index 000000000..280797ad7 --- /dev/null +++ b/example/38_elementwise_permute/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp) diff --git a/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp new file mode 100644 index 000000000..31defbc0c --- /dev/null +++ b/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -0,0 +1,105 @@ +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ADataType = F16; +using BDataType = F16; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using DeviceElementwisePermuteInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + PassThrough, + 4, + 8, + ck::Sequence<8>, + ck::Sequence<1>>; + +template +void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor) +{ + 
for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n) + for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c) + for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h) + for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w) + { + auto a_val = A_nchw(n, c, h, w); + functor(B_nhwc(n, h, w, c), a_val); + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + std::vector nchw = {4, 4, 8, 8}; + std::vector nhwc = {4, 8, 8, 4}; + Tensor a(nchw); + Tensor b(nhwc); + + a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a.mData.data()); + // LogRangeAsType(std::cout << "Tensor a : ", a.mData, ",") << std::endl; + + std::array input = {a_device_buf.GetDeviceBuffer()}; + std::array output = {b_device_buf.GetDeviceBuffer()}; + + std::array ab_lengths; + std::array a_strides = {static_cast(nchw[1] * nchw[2] * nchw[3]), + static_cast(nchw[2] * nchw[3]), + static_cast(nchw[3]), + 1}; + std::array b_strides = {static_cast(nhwc[1] * nhwc[2] * nhwc[3]), + 1, + static_cast(nhwc[2] * nhwc[3]), + static_cast(nhwc[3])}; + + std::copy(nchw.begin(), nchw.end(), ab_lengths.begin()); + + auto broadcastPermute = DeviceElementwisePermuteInstance{}; + auto argument = broadcastPermute.MakeArgumentPointer( + ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}); + + if(!broadcastPermute.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); + float ave_time = + broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + + if(do_verification) + { + b_device_buf.FromDevice(b.mData.data()); + // LogRangeAsType(std::cout << "Tensor b : ", b.mData, ",") << std::endl; + Tensor host_b(nhwc); + host_elementwise4D(host_b, a, PassThrough{}); + + // LogRangeAsType(std::cout << "Host b : ", host_b.mData, ",") << std::endl; + pass &= + ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); + } + + return pass ? 
0 : 1; +} -- GitLab From 685860c2a9483c9e909d2f8bfb950566724913c8 Mon Sep 17 00:00:00 2001 From: arai713 <67439843+arai713@users.noreply.github.com> Date: Tue, 18 Oct 2022 21:24:19 -0700 Subject: [PATCH 03/95] Tensor permutation (#479) --- .../CMakeLists.txt | 0 .../elementwise_permute_4D_fp16.cpp | 26 +++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) rename example/{38_elementwise_permute => 44_elementwise_permute}/CMakeLists.txt (100%) rename example/{38_elementwise_permute => 44_elementwise_permute}/elementwise_permute_4D_fp16.cpp (81%) diff --git a/example/38_elementwise_permute/CMakeLists.txt b/example/44_elementwise_permute/CMakeLists.txt similarity index 100% rename from example/38_elementwise_permute/CMakeLists.txt rename to example/44_elementwise_permute/CMakeLists.txt diff --git a/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp similarity index 81% rename from example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp rename to example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp index 31defbc0c..0ae9d5fd8 100644 --- a/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -3,7 +3,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -42,10 +42,10 @@ void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor int main() { bool do_verification = true; - bool time_kernel = false; + bool time_kernel = true; - std::vector nchw = {4, 4, 8, 8}; - std::vector nhwc = {4, 8, 8, 4}; + std::vector nchw = {16, 128, 32, 64}; + std::vector nhwc = {16, 32, 64, 128}; Tensor a(nchw); Tensor b(nhwc); @@ -55,7 +55,6 @@ int main() DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); a_device_buf.ToDevice(a.mData.data()); - // LogRangeAsType(std::cout << "Tensor a : ", a.mData, ",") << std::endl; std::array input = {a_device_buf.GetDeviceBuffer()}; std::array output = {b_device_buf.GetDeviceBuffer()}; @@ -81,22 +80,33 @@ int main() throw std::runtime_error( "The runtime parameters seems not supported by the device instance, exiting!"); }; + + std::cout << "A (nchw): " << a.mDesc << std::endl; + std::cout << "B (nhwc): " << b.mDesc << std::endl; + auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); float ave_time = broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3]; + + std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) + + sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; - std::cout << "Perf: " << ave_time << " ms" << std::endl; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; bool pass = true; if(do_verification) { b_device_buf.FromDevice(b.mData.data()); - // LogRangeAsType(std::cout << "Tensor b : ", b.mData, ",") << std::endl; Tensor host_b(nhwc); host_elementwise4D(host_b, a, PassThrough{}); - // LogRangeAsType(std::cout << "Host b : ", 
host_b.mData, ",") << std::endl; pass &= ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); } -- GitLab From efbcc6eddce63453df8009e5406eef2685f0a1a9 Mon Sep 17 00:00:00 2001 From: guangzlu <87220526+guangzlu@users.noreply.github.com> Date: Tue, 25 Oct 2022 10:23:20 +0800 Subject: [PATCH 04/95] Fused elementwise layernorm (#468) * add fused addition lyernorm * add fused addition lyernorm * changed CMakelist * removed annotates * modified descriptor of C * fixed bug in gridwise add layernorm * format the files * modified name from add&layernorm into elementwise&layernorm * created fused elementwise layernorm branch * change input into tuple type * add sweep once to reduce load & read of C from global memory * modified Argument api * modified way to malloc c in global memory * changed gamma and beta to m_k_desc * fixed bug when sweep once and move CDataType when define device level struct * add src dim for gamma and beta * implement optimization for coalesced * delete a annotation line * fixed some bug to meet the requirements of ck * add bandwidth computing in example, and fixed the time unit * move device_elementwise_layernorm_impl.hpp into device/impl * fixed bug in device_elementwise_layernorm_impl.hpp * changed name from layernorm into normalization * clang-format the changed files * changed the names * moved immidiate results into lds, it become faster in non-sweeponce cases * changed naming of C into X to make the defination more clear * changed naming in example * add tests for elementwise normalization * move example_elementwise_layernorm_blockwise into folder 44_elementwise_normalization * move test_elementwise_layernorm_fp16 into new folder * move elementwise_normalization_instances into a new folder * add more tests in test_elementwise_layernorm_fp16.cpp * added some corner cases in test * fixed method to compute lds size for matrix X * changed name of 44_elementwise_normalization into 45_elementwise_normalization * modified some comments * modified some other confused comments * reduce redundant tests in test_elementwise_layernorm_fp16.cpp --- example/27_layernorm/CMakeLists.txt | 2 +- .../CMakeLists.txt | 1 + .../elementwise_layernorm_blockwise.cpp | 195 ++++++ .../device_elementwise_normalization.hpp | 68 ++ .../device_elementwise_normalization_impl.hpp | 592 ++++++++++++++++++ ...elementwise_layernorm_welford_variance.hpp | 500 +++++++++++++++ .../gpu/elementwise_normalization.hpp | 79 +++ .../elementwise_normalization/CMakeLists.txt | 3 + ...elementwise_normalization_f16_instance.cpp | 54 ++ .../profile_elementwise_layernorm_impl.hpp | 264 ++++++++ test/CMakeLists.txt | 1 + test/elementwise_normalization/CMakeLists.txt | 7 + .../test_elementwise_layernorm_fp16.cpp | 47 ++ test/normalization/CMakeLists.txt | 5 +- 14 files changed, 1814 insertions(+), 4 deletions(-) create mode 100644 example/45_elementwise_normalization/CMakeLists.txt create mode 100644 example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp create mode 100644 library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt create 
mode 100644 library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp create mode 100644 profiler/include/profile_elementwise_layernorm_impl.hpp create mode 100644 test/elementwise_normalization/CMakeLists.txt create mode 100644 test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp diff --git a/example/27_layernorm/CMakeLists.txt b/example/27_layernorm/CMakeLists.txt index b2ca59c5e..d96deae45 100644 --- a/example/27_layernorm/CMakeLists.txt +++ b/example/27_layernorm/CMakeLists.txt @@ -1 +1 @@ -add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp) \ No newline at end of file +add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp) diff --git a/example/45_elementwise_normalization/CMakeLists.txt b/example/45_elementwise_normalization/CMakeLists.txt new file mode 100644 index 000000000..8f5b9d4d8 --- /dev/null +++ b/example/45_elementwise_normalization/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp) diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp new file mode 100644 index 000000000..7d6ff12ee --- /dev/null +++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" + +using ADataType = ck::half_t; // Input 1 +using BDataType = ck::half_t; // Input 2 +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; +using XElementwiseOperation = ck::tensor_operation::element_wise::Add; +using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +// X = Elementwise(input1, input2, input3, ...) 
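+// (in this example the X elementwise op is Add over two fp16 inputs, i.e. X = A + B)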
+// Y = Layernorm(X, beta, gamma) +using DeviceInstance = ck::tensor_operation::device::DeviceElementwiseNormalizationImpl< + ck::Tuple, + GammaDataType, + BetaDataType, + AccDataType, + YDataType, + XElementwiseOperation, + YElementwiseOperation, + Rank, + NumReduceDim, + 256, // BlockSize + 8, // ClusterM + 32, // ClusterK + 1, // SliceM + 32, // SliceK + 1, // SrcVecDim (0=M, 1=K) + 8, // SrcScalarPerVector + 1, // GammaVecDim (0=M, 1=K) + 8, // GammaScalarPerVector + 1, // BetaVecDim (0=M, 1=K) + 8, // BetaScalarPerVector + 8>; // OutScalarPerVector + +template +void host_elementwise2D(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(std::size_t m = 0; m < shape[0]; ++m) + for(std::size_t n = 0; n < shape[1]; ++n) + { + auto a_val = A(m, n); + auto b_val = B(m, n); + ctype c_val = 0; + functor(c_val, a_val, b_val); + C(m, n) = c_val; + } +} + +int main() +{ + bool time_kernel = true; + + ck::index_t M = 48 * 256; + ck::index_t N = 1024; + ck::index_t Stride = N; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor(std::vector({len}), + std::vector({stride})); + }; + + auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + }; + + Tensor a(f_host_tensor_descriptor2d(M, N, Stride)); + Tensor b(f_host_tensor_descriptor2d(M, N, Stride)); + Tensor gamma(f_host_tensor_descriptor1d(N, 1)); + Tensor beta(f_host_tensor_descriptor1d(N, 1)); + Tensor y(f_host_tensor_descriptor2d(M, N, Stride)); + + a.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + a_dev.ToDevice(a.mData.data()); + b_dev.ToDevice(b.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + std::array input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()}; + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {M, N}, + { + std::vector{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()}, + std::vector{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}, + }, + {0, 1}, + {0, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + {1}, + 1e-4, + input, + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + XElementwiseOperation{}, + YElementwiseOperation{}); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + float ela_time = 0; + ela_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + float data_mem_size = M * N * sizeof(ADataType) + M * N * sizeof(BDataType) + + M * N * sizeof(YDataType) + N * sizeof(GammaDataType) + + N * sizeof(BetaDataType); + float 
bandwidth = data_mem_size * 1000 / ela_time / 1024 / 1024 / 1024; + + std::cout << "Bandwidth is : " << bandwidth << "GB/s . " << std::endl; + std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl; + + bool pass = true; + { + std::vector mn = {static_cast(M), + static_cast(N)}; + Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); + host_elementwise2D, + Tensor, + Tensor, + XElementwiseOperation>(x, a, b, mn, XElementwiseOperation{}); + + Tensor host_y(f_host_tensor_descriptor2d(M, N, Stride)); + using ReferenceInstance = + ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, YElementwiseOperation{}, {M, N}, {1}, 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= + ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + if(!(pass)) + { + std::cout << "layernorm wrong" << std::endl; + } + } + return (pass ? 0 : 1); +} diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp new file mode 100644 index 000000000..d8a791c32 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceElementwiseNormalization : public BaseOperator +{ + static constexpr int NumInput = InDataTypeTuple::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::array, NumInput> inStridesArray, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + AccDataType epsilon, + const std::array in_dev_buffers, + const void* p_gamma, + const void* p_beta, + void* p_y, + XElementwiseOperation x_elementwise_op, + YElementwiseOperation y_elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceElementwiseNormalizationPtr = + std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp new file mode 100644 index 000000000..8ffc5ef9f --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
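+// The elementwise step and the layernorm are fused into one kernel. When a whole
+// reduction row fits into a single K block tile (K <= KThreadClusterSize * KThreadSliceSize)
+// a sweep-once kernel is selected; otherwise the intermediate X is staged in LDS.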
+ +#pragma once + +#include +#include + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/reduction_operator.hpp" + +#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +// X = Elementwise(input1, input2, input3, ...) +// Y = Normalization(X, beta, gamma) +namespace ck { +template // Descriptor of inputs, Gamma, Beta +__global__ void kernel_elementwise_layernorm( + const InGrid2dDescTuple in_grid_2d_desc_tuple, // Descriptor tuple of inputs + const GridDesc_M_K x_grid_desc_m_k, // Descriptor of X + const GridDesc_M_K gamma_grid_desc_m_k, // Descriptor of gamma + const GridDesc_M_K beta_grid_desc_m_k, // Descriptor of beta + const GridDesc_M_K y_grid_desc_m_k, // Descriptor of Y + index_t num_k_block_tile_iteration, // + AccDataType epsilon, // Datatype of epsilon + const InDataTypePointerTuple p_in_global_tuple, // Ptr tuple of input matrixs + const GammaDataType* const __restrict__ p_gamma_global, // Ptr of gamma + const BetaDataType* const __restrict__ p_beta_global, // Ptr of beta + YDataType* const __restrict__ p_y_global, // Ptr of y + const XElementwiseOperation x_elementwise_op, // Operation of input + const YElementwiseOperation y_elementwise_op) // Operation of output of normalization +{ + extern __shared__ XDataType p_x_lds[]; + GridwiseElementwiseReduction::Run(in_grid_2d_desc_tuple, // Descriptor tuple of inputs + x_grid_desc_m_k, // Descriptor of X + gamma_grid_desc_m_k, // Descriptor of Gamma + beta_grid_desc_m_k, // Descriptor of Beta + y_grid_desc_m_k, // Descriptor of Y + num_k_block_tile_iteration, // + epsilon, // epsilon + p_in_global_tuple, // Ptr tuple of inputs + p_x_lds, // Ptr of X + p_gamma_global, // Ptr of gamma + p_beta_global, // Ptr of beta + p_y_global, // Ptr of Y + x_elementwise_op, // Operation of input + y_elementwise_op); // Operation of output of normalization +}; +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// Y = LayerNorm(A + B, Beta, Gamma) +template // Size to write destination Y +struct DeviceElementwiseNormalizationImpl + : public DeviceElementwiseNormalization +{ + static constexpr int NumInput = InDataTypeTuple::Size(); + + using XDataType = YDataType; + + static_assert( + (KThreadSliceSize % GammaSrcVectorSize == 0), + "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!"); + + static_assert( + (KThreadSliceSize % BetaSrcVectorSize == 0), + "Invalid thread slice sizes and/or beta vector sizes configuration, please check!"); + + static constexpr index_t M_BlockTileSize = + MThreadClusterSize * MThreadSliceSize; // num of rows calculated in a block + static constexpr index_t K_BlockTileSize = + KThreadClusterSize * KThreadSliceSize; // num of columns calculated in a block + + static auto GenerateInDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(nullptr); + }, + Number{}); + }; + + using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& 
inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + constexpr index_t NumInvariantDim = Rank - NumReduceDim; + static constexpr index_t numSrcDim = Rank; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = + make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + template + static auto GenerateSrcGrid2dDescTuple(Number) + { + return generate_tuple([&](auto) { return MakeSrc2dDescriptor({1}, {1}, 1, 1); }, + Number{}); + }; + + using InGrid2dDescTuple = decltype(GenerateSrcGrid2dDescTuple(Number{})); + + using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); + + using GridwiseReduceLayernormGeneric = + GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; + + using GridwiseReduceLayernormSweepOnce = + GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; + + struct Argument : public BaseArgument + { + Argument(const std::vector lengths, + const std::array, NumInput> inStridesArray, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + XElementwiseOperation x_elementwise_op, + YElementwiseOperation y_elementwise_op, + AccDataType epsilon, + const std::array in_dev_buffers, + const GammaDataType* p_gamma, + const BetaDataType* p_beta, + YDataType* p_y) + : epsilon_(epsilon), + p_gamma_(p_gamma), + p_beta_(p_beta), + p_y_(p_y), + x_elementwise_op_(x_elementwise_op), + y_elementwise_op_(y_elementwise_op) + { + + Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); + 
for(int i = 0; i < NumInput; i++) + { + inStridesArray_[i] = + shuffle_tensor_dimensions(inStridesArray[i], reduceDims); + } + + yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); + xStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); + + gammaStrides_ = shuffle_tensor_dimensions(gammaStrides, reduceDims); + betaStrides_ = shuffle_tensor_dimensions(betaStrides, reduceDims); + + in_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(in_dev_buffers[I.value]); + }, + Number{}); + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(Lengths_); + + blkGroupSize_ = 1; + numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + + gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize_; + + in_grid_2d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeSrc2dDescriptor( + Lengths_, inStridesArray_[I.value], blkGroupSize_, numBlockTileIteration_); + }, + Number{}); + + x_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_); + + gamma_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_); + + beta_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_); + + y_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_); + + sweep_once_ = + x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; + + if(!sweep_once_) // if not sweep once, compute memory size for matrix X in lds for + // store Intermediate results + { + int block_TileSize = M_BlockTileSize * reduce_total_length; + x_lds_size_ = block_TileSize * sizeof(XDataType); + } + else + x_lds_size_ = 0; + } + + AccDataType epsilon_; + + InDataTypePointerTuple in_dev_buffers_; + const GammaDataType* p_gamma_; + const BetaDataType* p_beta_; + YDataType* p_y_; + + std::vector Lengths_; + std::array, NumInput> inStridesArray_; + std::vector xStrides_; + std::vector gammaStrides_; + std::vector betaStrides_; + std::vector yStrides_; + + XElementwiseOperation x_elementwise_op_; + YElementwiseOperation y_elementwise_op_; + + int blkGroupSize_; + int numBlockTileIteration_; + size_t gridSize_; + + InGrid2dDescTuple in_grid_2d_desc_tuple_; + GridDesc_M_K x_grid_desc_m_k_; + GridDesc_M_K gamma_grid_desc_m_k_; + GridDesc_M_K beta_grid_desc_m_k_; + GridDesc_M_K y_grid_desc_m_k_; + bool sweep_once_; + int x_lds_size_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto kernel_main = + arg.sweep_once_ ? 
kernel_elementwise_layernorm + : kernel_elementwise_layernorm; + + float avg_time = 0; + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize_), + dim3(BlockSize), + arg.x_lds_size_, + arg.in_grid_2d_desc_tuple_, + arg.x_grid_desc_m_k_, + arg.gamma_grid_desc_m_k_, + arg.beta_grid_desc_m_k_, + arg.y_grid_desc_m_k_, + arg.numBlockTileIteration_, + arg.epsilon_, + arg.in_dev_buffers_, + arg.p_gamma_, + arg.p_beta_, + arg.p_y_, + arg.x_elementwise_op_, + arg.y_elementwise_op_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* p_arg_ = dynamic_cast(p_arg); + + constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + if constexpr(XYSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return false; + } + else + { + for(int i = 0; i < NumInput; i++) + { + if(p_arg_->inStridesArray_[i][NumInvariantDim - 1] != 1) + return false; + } + + if(p_arg_->inStridesArray_[0][NumInvariantDim - 1] != 1 && + p_arg_->inStridesArray_[1][NumInvariantDim - 1] != 1) + return false; + + if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0) + return false; + }; + } + else + { + for(int i = 0; i < NumInput; i++) + { + if(p_arg_->inStridesArray_[i][Rank - 1] != 1) + return false; + } + + if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0) + return false; + }; + + if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0) + { + return false; + } + + auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) { + bool ret = true; + + if(!isLastDimensionCoalesced) + ret = scalarPerVector == 1; + else + ret = KThreadSliceSize % scalarPerVector == 0; + + return ret; + }; + + if(!IsScalarPerVectorValid(p_arg_->gammaStrides_.back() == 1, GammaSrcVectorSize)) + return false; + + if(!IsScalarPerVectorValid(p_arg_->betaStrides_.back() == 1, BetaSrcVectorSize)) + return false; + + // if fastest dim is not reduced + if constexpr(XYSrcVectorDim == 0) // + { + if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->gammaStrides_[Rank - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } + + // if fastest dim is not reduced + if constexpr(XYSrcVectorDim == 0) + { + if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->betaStrides_[Rank - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0) + return (false); + } + + return true; + }; + + std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::array, NumInput> inStridesArray, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + AccDataType epsilon, + const std::array in_dev_buffers, + const void* p_gamma, + const void* p_beta, + void* p_y, + XElementwiseOperation x_elementwise_op, + YElementwiseOperation y_elementwise_op) override + { + return std::make_unique(lengths, + inStridesArray, + gammaStrides, + betaStrides, + yStrides, + reduceDims, + x_elementwise_op, + y_elementwise_op, + epsilon, + 
in_dev_buffers, + static_cast(p_gamma), + static_cast(p_beta), + static_cast(p_y)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceElementwiseNormalizationImpl<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "XYSrcVectorDim_" << XYSrcVectorDim << ","; + str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp new file mode 100644 index 000000000..40d75e05a --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +// X = Elementwise(input1, input2, input3, ...) 
+// Y = Normalization(X, beta, gamma) +template +struct GridwiseElementwiseLayernormWelfordVariance_mk_to_mk +{ + static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || + (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || + (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr index_t NumInput = InDataTypePointerTuple::Size(); + + static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; + + static constexpr auto XThreadBufferNumber = Number{}; + static constexpr auto GammaThreadBufferNumber = Number{}; + static constexpr auto BetaThreadBufferNumber = Number{}; + static constexpr auto YThreadBufferNumber = Number{}; + + __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, + int thread_k_cluster_id) + { + int kPerBlock = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; + int kPerThread = + kPerBlock < K_BlockTileSize ? 
0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize); + int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize; + + if(kPerBlockTail > 0) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + int thread_max_len = + (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i; + int delta = thread_max_len - kPerBlockTail; + delta = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize); + kPerThread += XSrcVectorSize - delta; + }); + } + + return kPerThread; + } + + __device__ static void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple, + const GridDesc_M_K& x_grid_desc_m_k, + const GridDesc_M_K& gamma_grid_desc_m_k, + const GridDesc_M_K& beta_grid_desc_m_k, + const GridDesc_M_K& y_grid_desc_m_k, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const InDataTypePointerTuple p_in_global_tuple, + XDataType* const __restrict__ p_x_lds, + const GammaDataType* const __restrict__ p_gamma_global, + const BetaDataType* const __restrict__ p_beta_global, + YDataType* const __restrict__ p_y_global, + const XElementwiseOperation x_elementwise_op, + const YElementwiseOperation y_elementwise_op) + { + if constexpr(SweepOnce) + { + num_k_block_tile_iteration = 1; + } + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t grid_size = get_grid_size(); + + auto in_global_buf_tuple = generate_tuple( + [&](auto I) { + static_assert(in_grid_2d_desc_tuple[I].GetNumOfDimension() == + 2); // matrix dimension + + return make_dynamic_buffer( + p_in_global_tuple[I], in_grid_2d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + auto y_global_val_buf = make_dynamic_buffer( + p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); + + auto x_lds_val_buf = make_dynamic_buffer( + p_x_lds, x_grid_desc_m_k.GetElementSpaceSize() / grid_size); + + auto in_thread_buf_tuple = generate_tuple( + [&](auto) { + return generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + }, + Number{}); + + auto x_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto gamma_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto beta_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto y_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + StaticBuffer mean_thread_buf; + StaticBuffer var_thread_buf; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto in_global_load_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return ThreadwiseTensorSliceTransfer_v2{ + in_grid_2d_desc_tuple[I], + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * XSrcVectorSize)}; + }, + Number{}); + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * XSrcVectorSize)); + + auto threadwise_gamma_load = + ThreadwiseTensorSliceTransfer_v2( + gamma_grid_desc_m_k, + 
make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * GammaSrcVectorSize)); + + auto threadwise_beta_load = + ThreadwiseTensorSliceTransfer_v2( + beta_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * BetaSrcVectorSize)); + + using PassThrough = tensor_operation::element_wise::PassThrough; + PassThrough pass_through_op; + auto threadwise_x_store = + ThreadwiseTensorSliceTransfer_v1r3( + x_grid_desc_m_k, + make_multi_index(thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * XSrcVectorSize), + pass_through_op); + + auto threadwise_y_store = + ThreadwiseTensorSliceTransfer_v1r3( + y_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * YDstVectorSize), + y_elementwise_op); + + // Copy x from Cache + // one pass: fwd, second pass: bwd + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize); + constexpr auto thread_copy_bwd_step_m_k = + make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize); + + const auto gamma_global_val_buf = make_dynamic_buffer( + p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); + + const auto beta_global_val_buf = make_dynamic_buffer( + p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); + + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, NumInput, 1>{}([&](auto I) { // input load loop + in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I], + in_global_buf_tuple[I], + thread_buffer_desc_m_k, + make_tuple(I0, I0), + in_thread_buf_tuple(iK0)(I)); + + in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // input add loop + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // get reference to in data + const auto in_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> const auto& { + return in_thread_buf_tuple(iK0)(I)(Number{}); + }, + Number{}); + + // get reference to dst data + auto out_data_refs = generate_tie( + // return type should be lvalue + [&](auto) -> auto& { return x_thread_buf(iK0)(Number{}); }, + I1); + + unpack2(x_elementwise_op, out_data_refs, in_data_refs); + }); + }); + threadwise_welford.Run(x_thread_buf[iK0], mean_thread_buf, var_thread_buf); + + if constexpr(!SweepOnce) + { + threadwise_x_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(iK0), + x_grid_desc_m_k, + x_lds_val_buf); + threadwise_x_store.MoveDstSliceWindow(x_grid_desc_m_k, + thread_copy_fwd_step_m_k); + } + }); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); + }); + + auto thread_copy_tail_m_k = + (num_k_block_tile_iteration - 1) * XThreadBufferNumber * thread_copy_fwd_step_m_k; + + if 
constexpr(!SweepOnce) + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + if constexpr(!SweepOnce) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_lds_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); + } + + static_for<0, GammaThreadBufferNumber, 1>{}([&](auto i) { + threadwise_gamma_load.Run(gamma_grid_desc_m_k, + gamma_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + gamma_thread_buf(i)); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + auto divisor = 1 / __builtin_amdgcn_sqrtf(var_thread_buf(iM) + epsilon); + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // normalize + y_thread_buf(iK0)(Number{}) = + (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * + divisor; + + // gamma + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) * + gamma_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, BetaThreadBufferNumber, 1>{}([&](auto i) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(i)); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) + + beta_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, YThreadBufferNumber, 1>{}([&](auto i) { + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf(i), + y_grid_desc_m_k, + y_global_val_buf); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); + + if constexpr(!SweepOnce) + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + } + } +}; + +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp new file mode 100644 index 000000000..c87ae159b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
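[Editorial note, not part of this patch] The gridwise kernel above accumulates the per-row mean and variance with Welford updates (ThreadwiseWelford / BlockwiseWelford) and then, in a second pass, applies y = (x - mean) / sqrt(var + epsilon) * gamma + beta. A minimal host-side sketch of the same math, with hypothetical names and shown for reference only, assuming the population variance used here:

#include <cmath>
#include <vector>

// Hedged reference for one row x[0..n): Welford mean/variance, then layernorm.
void reference_layernorm_row(const std::vector<float>& x,
                             const std::vector<float>& gamma,
                             const std::vector<float>& beta,
                             std::vector<float>& y,
                             float eps)
{
    float mean = 0.f, m2 = 0.f;
    for(std::size_t k = 0; k < x.size(); ++k)
    {
        // Welford update: running mean and running sum of squared deviations
        float delta = x[k] - mean;
        mean += delta / static_cast<float>(k + 1);
        m2 += delta * (x[k] - mean);
    }
    const float var     = m2 / static_cast<float>(x.size()); // population variance
    const float inv_std = 1.f / std::sqrt(var + eps);
    for(std::size_t k = 0; k < x.size(); ++k)
        y[k] = (x[k] - mean) * inv_std * gamma[k] + beta[k];
}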
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_elementwise_normalization_rank_2_1_f16_instances( + std::vector, + F16, + F16, + F32, + F16, + element_wise::Add, + PassThrough, + 2, + 1>>>&); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceElementwiseNormalization; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_elementwise_normalization_rank_2_1_f16_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt new file mode 100644 index 000000000..0c7cc2cd3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt @@ -0,0 +1,3 @@ +add_instance_library(device_elementwise_normalization_instance + device_elementwise_normalization_f16_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp new file mode 100644 index 000000000..7f15372ed --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
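[Editorial note, not part of this patch] The DeviceOperationInstanceFactory declared in the header above is normally consumed by requesting all registered instances for the matching operation type and probing each one, exactly as the profiler added later in this patch does. A hedged sketch, assuming the f16 / Add / PassThrough, rank-2, one-reduce-dim configuration covered by these instances:

#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"

void list_elementwise_layernorm_instances()
{
    using F16  = ck::half_t;
    using F32  = float;
    using Add  = ck::tensor_operation::element_wise::Add;
    using Pass = ck::tensor_operation::element_wise::PassThrough;

    // Operation type matching the rank-2, one-reduce-dim f16 instances above.
    using DeviceOp = ck::tensor_operation::device::
        DeviceElementwiseNormalization<ck::Tuple<F16, F16>, F16, F16, F32, F16, Add, Pass, 2, 1>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    // Each instance would then get MakeArgumentPointer(...), IsSupportedArgument(...), Run(...).
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << std::endl;
}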
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Add = ck::tensor_operation::element_wise::Add; +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +// clang-format off +using device_elementwise_normalization_f16_instances = + std::tuple < + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>, // fallback kernel + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2> + >; +// clang-format on + +void add_device_elementwise_normalization_rank_2_1_f16_instances( + std::vector, F16, F16, F32, F16, Add, Pass, 2, 1>>>& + instances) +{ + add_device_operation_instances( + instances, device_elementwise_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profile_elementwise_layernorm_impl.hpp b/profiler/include/profile_elementwise_layernorm_impl.hpp new file mode 100644 index 000000000..f5135005f --- /dev/null +++ b/profiler/include/profile_elementwise_layernorm_impl.hpp @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
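[Editorial note, not part of this patch] Each row above supplies the template parameters named in the header comment. As a worked example for the first fallback row (256, 8, 32, 1, 8, ...), the block tile sizes follow from cluster size times slice size, and, per the argument setup in device_elementwise_normalization_impl.hpp, the single-pass (SweepOnce) kernel is selected whenever the reduced length fits in one K tile; otherwise X is staged through LDS:

// Hedged arithmetic for the first instance row (names hypothetical):
constexpr int MThreadClusterSize = 8,  KThreadClusterSize = 32;
constexpr int MThreadSliceSize   = 1,  KThreadSliceSize   = 8;
constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; //   8 rows per block
constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; // 256 columns per K tile
// sweep_once == (reduced length N <= K_BlockTileSize), i.e. N <= 256 for this row;
// otherwise x_lds_size = M_BlockTileSize * N * sizeof(XDataType) bytes of LDS are requested.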
+ +#pragma once + +#include + +#include "ck/ck.hpp" + +#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" + +namespace ck { +namespace profiler { + +template +void host_elementwise2D(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(std::size_t m = 0; m < shape[0]; ++m) + for(std::size_t n = 0; n < shape[1]; ++n) + { + auto a_val = A(m, n); + auto b_val = B(m, n); + ctype c_val = 0; + functor(c_val, a_val, b_val); + C(m, n) = c_val; + } +} + +template +bool profile_elementwise_layernorm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length) +{ + using Add = ck::tensor_operation::element_wise::Add; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + if(length.size() != 2) + return false; + + index_t M = length[0]; + index_t N = length[1]; + index_t Stride = N; + + constexpr int Rank = 2; + constexpr int NumReduceDim = 1; + + std::vector reduce_dim = {1}; + std::vector gammaBetaLength = {N}; + std::vector gammaBetaStride = {0, 1}; + + auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + }; + + Tensor a(length); + Tensor b(length); + Tensor gamma(gammaBetaLength); + Tensor beta(gammaBetaLength); + Tensor y(length); + Tensor host_y(length); + + switch(init_method) + { + case 0: + a.GenerateTensorValue(GeneratorTensor_1{}); + b.GenerateTensorValue(GeneratorTensor_1{}); + gamma.GenerateTensorValue(GeneratorTensor_1{}); + beta.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + a.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a.GenerateTensorValue(GeneratorTensor_3{0, 1}); + b.GenerateTensorValue(GeneratorTensor_3{0, 1}); + gamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + beta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_dev(sizeof(ADataType) * b.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + a_dev.ToDevice(a.mData.data()); + b_dev.ToDevice(b.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + std::array input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()}; + + // add device normalization instances + using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization< + ck::Tuple, + GammaDataType, + BetaDataType, + AccDataType, + YDataType, + Add, + PassThrough, + 2, + 1>; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + 
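[Editorial note, not part of this patch] What follows first computes the host reference (when verification is enabled) and then, for each instance, builds an argument, skips instances whose IsSupportedArgument check fails, times the remaining ones, and compares the device output against the reference. With avg_time reported in milliseconds, the bandwidth formula used below reduces to decimal GB/s:

// gb_per_sec = num_bytes / 1e6 / avg_time: bytes/1e6 gives MB, and MB per millisecond equals GB per second.
// Worked example: 1.0e9 bytes moved in 1.0 ms -> 1.0e9 / 1.0e6 / 1.0 = 1000 GB/s.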
std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using XDataType = ADataType; + std::vector mn = {static_cast(M), + static_cast(N)}; + Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); + host_elementwise2D, Tensor, Tensor, Add>( + x, a, b, mn, Add{}); + + using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer( + length, + { + std::vector{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()}, + std::vector{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}, + }, + gammaBetaStride, + gammaBetaStride, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + reduce_dim, + 1e-4, + input, + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + Add{}, + PassThrough{}); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + continue; + } + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) + + b.mDesc.GetElementSize() * sizeof(BDataType) + + gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + + beta.mDesc.GetElementSize() * sizeof(BetaDataType) + + y.mDesc.GetElementSize() * sizeof(YDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + y_dev.FromDevice(y.mData.data()); + + bool pass = + ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b : ", b.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_y : ", host_y.mData, ",") << std::endl; + LogRangeAsType(std::cout << "y : ", y.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." 
<< std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", length, ",") << ", "; + std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " + << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is tested" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e1b0b9c6e..cbe2937ef 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -52,3 +52,4 @@ add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) add_subdirectory(normalization) add_subdirectory(data_type) +add_subdirectory(elementwise_normalization) diff --git a/test/elementwise_normalization/CMakeLists.txt b/test/elementwise_normalization/CMakeLists.txt new file mode 100644 index 000000000..a20eb2632 --- /dev/null +++ b/test/elementwise_normalization/CMakeLists.txt @@ -0,0 +1,7 @@ +add_custom_target(test_elementwise_normalization) + +add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) + +target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance) + +add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp new file mode 100644 index 000000000..f01e963bd --- /dev/null +++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "profiler/include/profile_elementwise_layernorm_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using ck::index_t; + +template +class TestElementwiseLayernorm : public ::testing::Test +{ + protected: + using ADataType = std::tuple_element_t<0, Tuple>; + using BDataType = std::tuple_element_t<1, Tuple>; + using GammaDataType = std::tuple_element_t<2, Tuple>; + using BetaDataType = std::tuple_element_t<3, Tuple>; + using AccDataType = std::tuple_element_t<4, Tuple>; + using YDataType = std::tuple_element_t<5, Tuple>; + + void Run() + { + // M, N + std::vector> lengths = { + {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}}; + + for(auto length : lengths) + { + bool success = ck::profiler::profile_elementwise_layernorm_impl( + true, 2, false, false, length); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = ::testing::Types< + // ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + std::tuple>; + +TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes); +TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); } diff --git a/test/normalization/CMakeLists.txt b/test/normalization/CMakeLists.txt index ab6e2d1cd..4890f2f75 100644 --- a/test/normalization/CMakeLists.txt +++ b/test/normalization/CMakeLists.txt @@ -3,9 +3,9 @@ add_custom_target(test_layernorm) add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp) add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp) add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp) -add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) +add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) -target_link_libraries(test_layernorm2d_fp32 PRIVATE utility) +target_link_libraries(test_layernorm2d_fp32 PRIVATE utility) target_link_libraries(test_layernorm2d_fp16 PRIVATE utility) target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance) target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance) @@ -14,4 +14,3 @@ add_dependencies(test_layernorm test_layernorm2d_fp32) add_dependencies(test_layernorm test_layernorm2d_fp16) add_dependencies(test_layernorm test_groupnorm_fp16) add_dependencies(test_layernorm test_groupnorm_fp32) - -- GitLab From 6ea9257e9d9c9aa83bf603d270da6b3ebf832504 Mon Sep 17 00:00:00 2001 From: guangzlu <87220526+guangzlu@users.noreply.github.com> Date: Tue, 25 Oct 2022 18:37:12 +0800 Subject: [PATCH 05/95] Revert "Fused elementwise layernorm (#468)" (#491) This reverts commit efbcc6eddce63453df8009e5406eef2685f0a1a9. 
--- example/27_layernorm/CMakeLists.txt | 2 +- .../CMakeLists.txt | 1 - .../elementwise_layernorm_blockwise.cpp | 195 ------ .../device_elementwise_normalization.hpp | 68 -- .../device_elementwise_normalization_impl.hpp | 592 ------------------ ...elementwise_layernorm_welford_variance.hpp | 500 --------------- .../gpu/elementwise_normalization.hpp | 79 --- .../elementwise_normalization/CMakeLists.txt | 3 - ...elementwise_normalization_f16_instance.cpp | 54 -- .../profile_elementwise_layernorm_impl.hpp | 264 -------- test/CMakeLists.txt | 1 - test/elementwise_normalization/CMakeLists.txt | 7 - .../test_elementwise_layernorm_fp16.cpp | 47 -- test/normalization/CMakeLists.txt | 5 +- 14 files changed, 4 insertions(+), 1814 deletions(-) delete mode 100644 example/45_elementwise_normalization/CMakeLists.txt delete mode 100644 example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp delete mode 100644 include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt delete mode 100644 library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp delete mode 100644 profiler/include/profile_elementwise_layernorm_impl.hpp delete mode 100644 test/elementwise_normalization/CMakeLists.txt delete mode 100644 test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp diff --git a/example/27_layernorm/CMakeLists.txt b/example/27_layernorm/CMakeLists.txt index d96deae45..b2ca59c5e 100644 --- a/example/27_layernorm/CMakeLists.txt +++ b/example/27_layernorm/CMakeLists.txt @@ -1 +1 @@ -add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp) +add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp) \ No newline at end of file diff --git a/example/45_elementwise_normalization/CMakeLists.txt b/example/45_elementwise_normalization/CMakeLists.txt deleted file mode 100644 index 8f5b9d4d8..000000000 --- a/example/45_elementwise_normalization/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_example_executable(example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp) diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp deleted file mode 100644 index 7d6ff12ee..000000000 --- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" - -using ADataType = ck::half_t; // Input 1 -using BDataType = ck::half_t; // Input 2 -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using AccDataType = float; -using XElementwiseOperation = ck::tensor_operation::element_wise::Add; -using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough; - -constexpr int Rank = 2; -constexpr int NumReduceDim = 1; - -// X = Elementwise(input1, input2, input3, ...) -// Y = Layernorm(X, beta, gamma) -using DeviceInstance = ck::tensor_operation::device::DeviceElementwiseNormalizationImpl< - ck::Tuple, - GammaDataType, - BetaDataType, - AccDataType, - YDataType, - XElementwiseOperation, - YElementwiseOperation, - Rank, - NumReduceDim, - 256, // BlockSize - 8, // ClusterM - 32, // ClusterK - 1, // SliceM - 32, // SliceK - 1, // SrcVecDim (0=M, 1=K) - 8, // SrcScalarPerVector - 1, // GammaVecDim (0=M, 1=K) - 8, // GammaScalarPerVector - 1, // BetaVecDim (0=M, 1=K) - 8, // BetaScalarPerVector - 8>; // OutScalarPerVector - -template -void host_elementwise2D(HostTensorC& C, - const HostTensorA& A, - const HostTensorB& B, - const std::vector& shape, - Functor functor) -{ - using ctype = ck::remove_reference_t; - - for(std::size_t m = 0; m < shape[0]; ++m) - for(std::size_t n = 0; n < shape[1]; ++n) - { - auto a_val = A(m, n); - auto b_val = B(m, n); - ctype c_val = 0; - functor(c_val, a_val, b_val); - C(m, n) = c_val; - } -} - -int main() -{ - bool time_kernel = true; - - ck::index_t M = 48 * 256; - ck::index_t N = 1024; - ck::index_t Stride = N; - - auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { - return HostTensorDescriptor(std::vector({len}), - std::vector({stride})); - }; - - auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - }; - - Tensor a(f_host_tensor_descriptor2d(M, N, Stride)); - Tensor b(f_host_tensor_descriptor2d(M, N, Stride)); - Tensor gamma(f_host_tensor_descriptor1d(N, 1)); - Tensor beta(f_host_tensor_descriptor1d(N, 1)); - Tensor y(f_host_tensor_descriptor2d(M, N, Stride)); - - a.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - - DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); - DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); - DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); - DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); - DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); - - a_dev.ToDevice(a.mData.data()); - b_dev.ToDevice(b.mData.data()); - gamma_dev.ToDevice(gamma.mData.data()); - 
beta_dev.ToDevice(beta.mData.data()); - - std::array input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()}; - - auto device_instance = DeviceInstance{}; - auto argument_ptr = device_instance.MakeArgumentPointer( - {M, N}, - { - std::vector{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()}, - std::vector{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}, - }, - {0, 1}, - {0, 1}, - std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, - {1}, - 1e-4, - input, - gamma_dev.GetDeviceBuffer(), - beta_dev.GetDeviceBuffer(), - y_dev.GetDeviceBuffer(), - XElementwiseOperation{}, - YElementwiseOperation{}); - - if(!device_instance.IsSupportedArgument(argument_ptr.get())) - { - std::cout << "The runtime parameters are not supported" << std::endl; - return 1; - }; - - auto invoker_ptr = device_instance.MakeInvokerPointer(); - float ela_time = 0; - ela_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - - float data_mem_size = M * N * sizeof(ADataType) + M * N * sizeof(BDataType) + - M * N * sizeof(YDataType) + N * sizeof(GammaDataType) + - N * sizeof(BetaDataType); - float bandwidth = data_mem_size * 1000 / ela_time / 1024 / 1024 / 1024; - - std::cout << "Bandwidth is : " << bandwidth << "GB/s . " << std::endl; - std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl; - - bool pass = true; - { - std::vector mn = {static_cast(M), - static_cast(N)}; - Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); - host_elementwise2D, - Tensor, - Tensor, - XElementwiseOperation>(x, a, b, mn, XElementwiseOperation{}); - - Tensor host_y(f_host_tensor_descriptor2d(M, N, Stride)); - using ReferenceInstance = - ck::tensor_operation::host::ReferenceLayernorm; - - ReferenceInstance ref; - auto ref_argument = - ref.MakeArgument(x, gamma, beta, host_y, YElementwiseOperation{}, {M, N}, {1}, 1e-4); - auto ref_invoker = ref.MakeInvoker(); - ref_invoker.Run(ref_argument); - - y_dev.FromDevice(y.mData.data()); - pass &= - ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); - if(!(pass)) - { - std::cout << "layernorm wrong" << std::endl; - } - } - return (pass ? 0 : 1); -} diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp deleted file mode 100644 index d8a791c32..000000000 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include -#include - -#include "ck/tensor_operation/gpu/device/device_base.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceElementwiseNormalization : public BaseOperator -{ - static constexpr int NumInput = InDataTypeTuple::Size(); - - virtual std::unique_ptr - MakeArgumentPointer(const std::vector lengths, - const std::array, NumInput> inStridesArray, - const std::vector gammaStrides, - const std::vector betaStrides, - const std::vector yStrides, - const std::vector reduceDims, - AccDataType epsilon, - const std::array in_dev_buffers, - const void* p_gamma, - const void* p_beta, - void* p_y, - XElementwiseOperation x_elementwise_op, - YElementwiseOperation y_elementwise_op) = 0; - - virtual std::unique_ptr MakeInvokerPointer() = 0; -}; - -template -using DeviceElementwiseNormalizationPtr = - std::unique_ptr>; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp deleted file mode 100644 index 8ffc5ef9f..000000000 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp +++ /dev/null @@ -1,592 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include - -#include "ck/utility/math.hpp" -#include "ck/utility/sequence.hpp" -#include "ck/utility/reduction_operator.hpp" - -#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" - -// X = Elementwise(input1, input2, input3, ...) 
-// Y = Normalization(X, beta, gamma) -namespace ck { -template // Descriptor of inputs, Gamma, Beta -__global__ void kernel_elementwise_layernorm( - const InGrid2dDescTuple in_grid_2d_desc_tuple, // Descriptor tuple of inputs - const GridDesc_M_K x_grid_desc_m_k, // Descriptor of X - const GridDesc_M_K gamma_grid_desc_m_k, // Descriptor of gamma - const GridDesc_M_K beta_grid_desc_m_k, // Descriptor of beta - const GridDesc_M_K y_grid_desc_m_k, // Descriptor of Y - index_t num_k_block_tile_iteration, // - AccDataType epsilon, // Datatype of epsilon - const InDataTypePointerTuple p_in_global_tuple, // Ptr tuple of input matrixs - const GammaDataType* const __restrict__ p_gamma_global, // Ptr of gamma - const BetaDataType* const __restrict__ p_beta_global, // Ptr of beta - YDataType* const __restrict__ p_y_global, // Ptr of y - const XElementwiseOperation x_elementwise_op, // Operation of input - const YElementwiseOperation y_elementwise_op) // Operation of output of normalization -{ - extern __shared__ XDataType p_x_lds[]; - GridwiseElementwiseReduction::Run(in_grid_2d_desc_tuple, // Descriptor tuple of inputs - x_grid_desc_m_k, // Descriptor of X - gamma_grid_desc_m_k, // Descriptor of Gamma - beta_grid_desc_m_k, // Descriptor of Beta - y_grid_desc_m_k, // Descriptor of Y - num_k_block_tile_iteration, // - epsilon, // epsilon - p_in_global_tuple, // Ptr tuple of inputs - p_x_lds, // Ptr of X - p_gamma_global, // Ptr of gamma - p_beta_global, // Ptr of beta - p_y_global, // Ptr of Y - x_elementwise_op, // Operation of input - y_elementwise_op); // Operation of output of normalization -}; -} // namespace ck - -namespace ck { -namespace tensor_operation { -namespace device { - -// Y = LayerNorm(A + B, Beta, Gamma) -template // Size to write destination Y -struct DeviceElementwiseNormalizationImpl - : public DeviceElementwiseNormalization -{ - static constexpr int NumInput = InDataTypeTuple::Size(); - - using XDataType = YDataType; - - static_assert( - (KThreadSliceSize % GammaSrcVectorSize == 0), - "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!"); - - static_assert( - (KThreadSliceSize % BetaSrcVectorSize == 0), - "Invalid thread slice sizes and/or beta vector sizes configuration, please check!"); - - static constexpr index_t M_BlockTileSize = - MThreadClusterSize * MThreadSliceSize; // num of rows calculated in a block - static constexpr index_t K_BlockTileSize = - KThreadClusterSize * KThreadSliceSize; // num of columns calculated in a block - - static auto GenerateInDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(nullptr); - }, - Number{}); - }; - - using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides, - int blkGroupSize, - int numBlockTileIteration) - { - constexpr index_t NumInvariantDim = Rank - NumReduceDim; - static constexpr index_t numSrcDim = Rank; - static constexpr bool reduceAllDim = (NumInvariantDim == 0); - - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); - - const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto in_grid_desc_m_k = [&]() { - if constexpr(reduceAllDim) - { - const auto one_dim_inDesc = transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - 
make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - return transform_tensor_descriptor(one_dim_inDesc, - make_tuple(make_unmerge_transform(make_tuple( - 1, one_dim_inDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - } - else - { - using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; - using ReduceDims = typename arithmetic_sequence_gen::type; - - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); - - return transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(reduceDimLengths)), - make_tuple(InvariantDims{}, ReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }(); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (in_grid_desc_m_k_padded); - }; - - template - static auto GenerateSrcGrid2dDescTuple(Number) - { - return generate_tuple([&](auto) { return MakeSrc2dDescriptor({1}, {1}, 1, 1); }, - Number{}); - }; - - using InGrid2dDescTuple = decltype(GenerateSrcGrid2dDescTuple(Number{})); - - using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); - - using GridwiseReduceLayernormGeneric = - GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; - - using GridwiseReduceLayernormSweepOnce = - GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; - - struct Argument : public BaseArgument - { - Argument(const std::vector lengths, - const std::array, NumInput> inStridesArray, - const std::vector gammaStrides, - const std::vector betaStrides, - const std::vector yStrides, - const std::vector reduceDims, - XElementwiseOperation x_elementwise_op, - YElementwiseOperation y_elementwise_op, - AccDataType epsilon, - const std::array in_dev_buffers, - const GammaDataType* p_gamma, - const BetaDataType* p_beta, - YDataType* p_y) - : epsilon_(epsilon), - p_gamma_(p_gamma), - p_beta_(p_beta), - p_y_(p_y), - x_elementwise_op_(x_elementwise_op), - y_elementwise_op_(y_elementwise_op) - { - - Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); - for(int i = 0; i < NumInput; i++) - { - inStridesArray_[i] = - shuffle_tensor_dimensions(inStridesArray[i], reduceDims); - } - - yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); - xStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); - - gammaStrides_ = shuffle_tensor_dimensions(gammaStrides, reduceDims); - betaStrides_ = shuffle_tensor_dimensions(betaStrides, reduceDims); - - in_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(in_dev_buffers[I.value]); - }, - Number{}); - - long_index_t invariant_total_length; - long_index_t reduce_total_length; - - std::tie(invariant_total_length, 
reduce_total_length) = - get_2d_lengths(Lengths_); - - blkGroupSize_ = 1; - numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; - - gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize * blkGroupSize_; - - in_grid_2d_desc_tuple_ = generate_tuple( - [&](auto I) { - return MakeSrc2dDescriptor( - Lengths_, inStridesArray_[I.value], blkGroupSize_, numBlockTileIteration_); - }, - Number{}); - - x_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_); - - gamma_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_); - - beta_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_); - - y_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_); - - sweep_once_ = - x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; - - if(!sweep_once_) // if not sweep once, compute memory size for matrix X in lds for - // store Intermediate results - { - int block_TileSize = M_BlockTileSize * reduce_total_length; - x_lds_size_ = block_TileSize * sizeof(XDataType); - } - else - x_lds_size_ = 0; - } - - AccDataType epsilon_; - - InDataTypePointerTuple in_dev_buffers_; - const GammaDataType* p_gamma_; - const BetaDataType* p_beta_; - YDataType* p_y_; - - std::vector Lengths_; - std::array, NumInput> inStridesArray_; - std::vector xStrides_; - std::vector gammaStrides_; - std::vector betaStrides_; - std::vector yStrides_; - - XElementwiseOperation x_elementwise_op_; - YElementwiseOperation y_elementwise_op_; - - int blkGroupSize_; - int numBlockTileIteration_; - size_t gridSize_; - - InGrid2dDescTuple in_grid_2d_desc_tuple_; - GridDesc_M_K x_grid_desc_m_k_; - GridDesc_M_K gamma_grid_desc_m_k_; - GridDesc_M_K beta_grid_desc_m_k_; - GridDesc_M_K y_grid_desc_m_k_; - bool sweep_once_; - int x_lds_size_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto kernel_main = - arg.sweep_once_ ? 
kernel_elementwise_layernorm - : kernel_elementwise_layernorm; - - float avg_time = 0; - avg_time += launch_and_time_kernel(stream_config, - kernel_main, - dim3(arg.gridSize_), - dim3(BlockSize), - arg.x_lds_size_, - arg.in_grid_2d_desc_tuple_, - arg.x_grid_desc_m_k_, - arg.gamma_grid_desc_m_k_, - arg.beta_grid_desc_m_k_, - arg.y_grid_desc_m_k_, - arg.numBlockTileIteration_, - arg.epsilon_, - arg.in_dev_buffers_, - arg.p_gamma_, - arg.p_beta_, - arg.p_y_, - arg.x_elementwise_op_, - arg.y_elementwise_op_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - }; - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* p_arg_ = dynamic_cast(p_arg); - - constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - if constexpr(XYSrcVectorDim == 0) - { - if constexpr(NumInvariantDim == 0) - { - return false; - } - else - { - for(int i = 0; i < NumInput; i++) - { - if(p_arg_->inStridesArray_[i][NumInvariantDim - 1] != 1) - return false; - } - - if(p_arg_->inStridesArray_[0][NumInvariantDim - 1] != 1 && - p_arg_->inStridesArray_[1][NumInvariantDim - 1] != 1) - return false; - - if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0) - return false; - }; - } - else - { - for(int i = 0; i < NumInput; i++) - { - if(p_arg_->inStridesArray_[i][Rank - 1] != 1) - return false; - } - - if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0) - return false; - }; - - if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0) - { - return false; - } - - auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) { - bool ret = true; - - if(!isLastDimensionCoalesced) - ret = scalarPerVector == 1; - else - ret = KThreadSliceSize % scalarPerVector == 0; - - return ret; - }; - - if(!IsScalarPerVectorValid(p_arg_->gammaStrides_.back() == 1, GammaSrcVectorSize)) - return false; - - if(!IsScalarPerVectorValid(p_arg_->betaStrides_.back() == 1, BetaSrcVectorSize)) - return false; - - // if fastest dim is not reduced - if constexpr(XYSrcVectorDim == 0) // - { - if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) - return (false); - } - else // if fastest dim is reduced - { - if(p_arg_->gammaStrides_[Rank - 1] != 1) - return (false); - - if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) - return (false); - } - - // if fastest dim is not reduced - if constexpr(XYSrcVectorDim == 0) - { - if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0) - return (false); - } - else // if fastest dim is reduced - { - if(p_arg_->betaStrides_[Rank - 1] != 1) - return (false); - - if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0) - return (false); - } - - return true; - }; - - std::unique_ptr - MakeArgumentPointer(const std::vector lengths, - const std::array, NumInput> inStridesArray, - const std::vector gammaStrides, - const std::vector betaStrides, - const std::vector yStrides, - const std::vector reduceDims, - AccDataType epsilon, - const std::array in_dev_buffers, - const void* p_gamma, - const void* p_beta, - void* p_y, - XElementwiseOperation x_elementwise_op, - YElementwiseOperation y_elementwise_op) override - { - return std::make_unique(lengths, - inStridesArray, - gammaStrides, - betaStrides, - yStrides, - reduceDims, - x_elementwise_op, - y_elementwise_op, - epsilon, - 
in_dev_buffers, - static_cast(p_gamma), - static_cast(p_beta), - static_cast(p_y)); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceElementwiseNormalizationImpl<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "XYSrcVectorDim_" << XYSrcVectorDim << ","; - str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp deleted file mode 100644 index 40d75e05a..000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp +++ /dev/null @@ -1,500 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" -#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -namespace ck { - -// X = Elementwise(input1, input2, input3, ...) 
-// Y = Normalization(X, beta, gamma) -template -struct GridwiseElementwiseLayernormWelfordVariance_mk_to_mk -{ - static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || - (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || - (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static constexpr index_t NumInput = InDataTypePointerTuple::Size(); - - static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using ThreadwiseWelford = - ThreadwiseWelford; - - using BlockwiseWelford = BlockwiseWelford; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; - - static constexpr auto XThreadBufferNumber = Number{}; - static constexpr auto GammaThreadBufferNumber = Number{}; - static constexpr auto BetaThreadBufferNumber = Number{}; - static constexpr auto YThreadBufferNumber = Number{}; - - __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, - int thread_k_cluster_id) - { - int kPerBlock = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; - int kPerThread = - kPerBlock < K_BlockTileSize ? 
0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize); - int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize; - - if(kPerBlockTail > 0) - { - static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { - int thread_max_len = - (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i; - int delta = thread_max_len - kPerBlockTail; - delta = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize); - kPerThread += XSrcVectorSize - delta; - }); - } - - return kPerThread; - } - - __device__ static void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple, - const GridDesc_M_K& x_grid_desc_m_k, - const GridDesc_M_K& gamma_grid_desc_m_k, - const GridDesc_M_K& beta_grid_desc_m_k, - const GridDesc_M_K& y_grid_desc_m_k, - index_t num_k_block_tile_iteration, - AccDataType epsilon, - const InDataTypePointerTuple p_in_global_tuple, - XDataType* const __restrict__ p_x_lds, - const GammaDataType* const __restrict__ p_gamma_global, - const BetaDataType* const __restrict__ p_beta_global, - YDataType* const __restrict__ p_y_global, - const XElementwiseOperation x_elementwise_op, - const YElementwiseOperation y_elementwise_op) - { - if constexpr(SweepOnce) - { - num_k_block_tile_iteration = 1; - } - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t grid_size = get_grid_size(); - - auto in_global_buf_tuple = generate_tuple( - [&](auto I) { - static_assert(in_grid_2d_desc_tuple[I].GetNumOfDimension() == - 2); // matrix dimension - - return make_dynamic_buffer( - p_in_global_tuple[I], in_grid_2d_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - auto y_global_val_buf = make_dynamic_buffer( - p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); - - auto x_lds_val_buf = make_dynamic_buffer( - p_x_lds, x_grid_desc_m_k.GetElementSpaceSize() / grid_size); - - auto in_thread_buf_tuple = generate_tuple( - [&](auto) { - return generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - }, - Number{}); - - auto x_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - auto gamma_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - auto beta_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - auto y_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - StaticBuffer mean_thread_buf; - StaticBuffer var_thread_buf; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - using ThreadBufferLengths_M_K = Sequence; - - constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto in_global_load_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_cv_t>; - - return ThreadwiseTensorSliceTransfer_v2{ - in_grid_2d_desc_tuple[I], - make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * XSrcVectorSize)}; - }, - Number{}); - - auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( - x_grid_desc_m_k, - make_multi_index(thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * XSrcVectorSize)); - - auto threadwise_gamma_load = - ThreadwiseTensorSliceTransfer_v2( - gamma_grid_desc_m_k, - 
make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * GammaSrcVectorSize)); - - auto threadwise_beta_load = - ThreadwiseTensorSliceTransfer_v2( - beta_grid_desc_m_k, - make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * BetaSrcVectorSize)); - - using PassThrough = tensor_operation::element_wise::PassThrough; - PassThrough pass_through_op; - auto threadwise_x_store = - ThreadwiseTensorSliceTransfer_v1r3( - x_grid_desc_m_k, - make_multi_index(thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * XSrcVectorSize), - pass_through_op); - - auto threadwise_y_store = - ThreadwiseTensorSliceTransfer_v1r3( - y_grid_desc_m_k, - make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * YDstVectorSize), - y_elementwise_op); - - // Copy x from Cache - // one pass: fwd, second pass: bwd - constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize); - constexpr auto thread_copy_bwd_step_m_k = - make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize); - - const auto gamma_global_val_buf = make_dynamic_buffer( - p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); - - const auto beta_global_val_buf = make_dynamic_buffer( - p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); - - auto threadwise_welford = ThreadwiseWelford(); - threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - mean_thread_buf(I) = type_convert(0.0f); - var_thread_buf(I) = type_convert(0.0f); - }); - - for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) - { - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { - static_for<0, NumInput, 1>{}([&](auto I) { // input load loop - in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I], - in_global_buf_tuple[I], - thread_buffer_desc_m_k, - make_tuple(I0, I0), - in_thread_buf_tuple(iK0)(I)); - - in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], - thread_copy_fwd_step_m_k); - }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // input add loop - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); - - // get reference to in data - const auto in_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> const auto& { - return in_thread_buf_tuple(iK0)(I)(Number{}); - }, - Number{}); - - // get reference to dst data - auto out_data_refs = generate_tie( - // return type should be lvalue - [&](auto) -> auto& { return x_thread_buf(iK0)(Number{}); }, - I1); - - unpack2(x_elementwise_op, out_data_refs, in_data_refs); - }); - }); - threadwise_welford.Run(x_thread_buf[iK0], mean_thread_buf, var_thread_buf); - - if constexpr(!SweepOnce) - { - threadwise_x_store.Run(thread_buffer_desc_m_k, - make_tuple(I0, I0), - x_thread_buf(iK0), - x_grid_desc_m_k, - x_lds_val_buf); - threadwise_x_store.MoveDstSliceWindow(x_grid_desc_m_k, - thread_copy_fwd_step_m_k); - } - }); - } - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if constexpr(I > 0) - block_sync_lds(); - - int count = threadwise_welford.cur_count_; - BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); - }); - - auto thread_copy_tail_m_k = - (num_k_block_tile_iteration - 1) * XThreadBufferNumber * thread_copy_fwd_step_m_k; - - if 
constexpr(!SweepOnce) - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); - - for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) - { - if constexpr(!SweepOnce) - { - static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { - threadwise_x_load.Run(x_grid_desc_m_k, - x_lds_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - x_thread_buf(i)); - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); - }); - } - - static_for<0, GammaThreadBufferNumber, 1>{}([&](auto i) { - threadwise_gamma_load.Run(gamma_grid_desc_m_k, - gamma_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - gamma_thread_buf(i)); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, - thread_copy_fwd_step_m_k); - }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - auto divisor = 1 / __builtin_amdgcn_sqrtf(var_thread_buf(iM) + epsilon); - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); - - // normalize - y_thread_buf(iK0)(Number{}) = - (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * - divisor; - - // gamma - y_thread_buf(iK0)(Number{}) = - y_thread_buf(iK0)(Number{}) * - gamma_thread_buf(iK0)(Number{}); - }); - }); - }); - - static_for<0, BetaThreadBufferNumber, 1>{}([&](auto i) { - threadwise_beta_load.Run(beta_grid_desc_m_k, - beta_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - beta_thread_buf(i)); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - thread_copy_fwd_step_m_k); - }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); - - // beta - y_thread_buf(iK0)(Number{}) = - y_thread_buf(iK0)(Number{}) + - beta_thread_buf(iK0)(Number{}); - }); - }); - }); - - static_for<0, YThreadBufferNumber, 1>{}([&](auto i) { - threadwise_y_store.Run(thread_buffer_desc_m_k, - make_tuple(I0, I0), - y_thread_buf(i), - y_grid_desc_m_k, - y_global_val_buf); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k); - }); - - if constexpr(!SweepOnce) - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, - 2 * thread_copy_bwd_step_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - 2 * thread_copy_bwd_step_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); - } - } -}; - -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp deleted file mode 100644 index c87ae159b..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// FP16 -void add_device_elementwise_normalization_rank_2_1_f16_instances( - std::vector, - F16, - F16, - F32, - F16, - element_wise::Add, - PassThrough, - 2, - 1>>>&); - -template -struct DeviceOperationInstanceFactory> -{ - using DeviceOp = DeviceElementwiseNormalization; - - static auto GetInstances() - { - std::vector> op_ptrs; - - if constexpr(is_same_v && is_same_v && - is_same_v) - { - if constexpr(Rank == 2 && NumReduceDim == 1) - { - add_device_elementwise_normalization_rank_2_1_f16_instances(op_ptrs); - } - } - - return op_ptrs; - } -}; - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt deleted file mode 100644 index 0c7cc2cd3..000000000 --- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_instance_library(device_elementwise_normalization_instance - device_elementwise_normalization_f16_instance.cpp -) diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp deleted file mode 100644 index 7f15372ed..000000000 --- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -using Add = ck::tensor_operation::element_wise::Add; -using Pass = ck::tensor_operation::element_wise::PassThrough; - -template -// clang-format off -using device_elementwise_normalization_f16_instances = - std::tuple < - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>, // fallback kernel - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>, - DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2> - >; -// clang-format on - -void add_device_elementwise_normalization_rank_2_1_f16_instances( - std::vector, F16, F16, F32, F16, Add, Pass, 2, 1>>>& - instances) -{ - add_device_operation_instances( - instances, device_elementwise_normalization_f16_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/profiler/include/profile_elementwise_layernorm_impl.hpp b/profiler/include/profile_elementwise_layernorm_impl.hpp deleted file mode 100644 index f5135005f..000000000 --- a/profiler/include/profile_elementwise_layernorm_impl.hpp +++ /dev/null @@ -1,264 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/ck.hpp" - -#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" - -namespace ck { -namespace profiler { - -template -void host_elementwise2D(HostTensorC& C, - const HostTensorA& A, - const HostTensorB& B, - const std::vector& shape, - Functor functor) -{ - using ctype = ck::remove_reference_t; - - for(std::size_t m = 0; m < shape[0]; ++m) - for(std::size_t n = 0; n < shape[1]; ++n) - { - auto a_val = A(m, n); - auto b_val = B(m, n); - ctype c_val = 0; - functor(c_val, a_val, b_val); - C(m, n) = c_val; - } -} - -template -bool profile_elementwise_layernorm_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - std::vector length) -{ - using Add = ck::tensor_operation::element_wise::Add; - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - if(length.size() != 2) - return false; - - index_t M = length[0]; - index_t N = length[1]; - index_t Stride = N; - - constexpr int Rank = 2; - constexpr int NumReduceDim = 1; - - std::vector reduce_dim = {1}; - std::vector gammaBetaLength = {N}; - std::vector gammaBetaStride = {0, 1}; - - auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - }; - - Tensor a(length); - Tensor b(length); - Tensor gamma(gammaBetaLength); - Tensor beta(gammaBetaLength); - Tensor y(length); - Tensor host_y(length); - - switch(init_method) - { - case 0: - a.GenerateTensorValue(GeneratorTensor_1{}); - b.GenerateTensorValue(GeneratorTensor_1{}); - gamma.GenerateTensorValue(GeneratorTensor_1{}); - beta.GenerateTensorValue(GeneratorTensor_1{}); - break; - case 1: - a.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; - default: - a.GenerateTensorValue(GeneratorTensor_3{0, 1}); - b.GenerateTensorValue(GeneratorTensor_3{0, 1}); - gamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - beta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - - DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); - DeviceMem b_dev(sizeof(ADataType) * b.mDesc.GetElementSpaceSize()); - DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); - DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); - DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); - - a_dev.ToDevice(a.mData.data()); - b_dev.ToDevice(b.mData.data()); - gamma_dev.ToDevice(gamma.mData.data()); - beta_dev.ToDevice(beta.mData.data()); - - std::array input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()}; - - // add device normalization instances - using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization< - ck::Tuple, - GammaDataType, - BetaDataType, - AccDataType, - YDataType, - Add, - PassThrough, - 2, - 1>; - - // get device op instances - const auto instance_ptrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; - - 
std::string best_instance_name; - float best_avg_time = std::numeric_limits::max(); - float best_gb_per_sec = 0; - - if(do_verification) - { - using XDataType = ADataType; - std::vector mn = {static_cast(M), - static_cast(N)}; - Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); - host_elementwise2D, Tensor, Tensor, Add>( - x, a, b, mn, Add{}); - - using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm; - - ReferenceInstance ref; - auto ref_argument = - ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4); - auto ref_invoker = ref.MakeInvoker(); - ref_invoker.Run(ref_argument); - } - - int num_kernel = 0; - - for(auto& inst_ptr : instance_ptrs) - { - auto argument_ptr = inst_ptr->MakeArgumentPointer( - length, - { - std::vector{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()}, - std::vector{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}, - }, - gammaBetaStride, - gammaBetaStride, - std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, - reduce_dim, - 1e-4, - input, - gamma_dev.GetDeviceBuffer(), - beta_dev.GetDeviceBuffer(), - y_dev.GetDeviceBuffer(), - Add{}, - PassThrough{}); - - if(inst_ptr->IsSupportedArgument(argument_ptr.get())) - { - ++num_kernel; - } - else - { - continue; - } - - auto invoker_ptr = inst_ptr->MakeInvokerPointer(); - - float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) + - b.mDesc.GetElementSize() * sizeof(BDataType) + - gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + - beta.mDesc.GetElementSize() * sizeof(BetaDataType) + - y.mDesc.GetElementSize() * sizeof(YDataType); - - float gb_per_sec = num_bytes / 1.E6 / avg_time; - - if(time_kernel) - std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " - << inst_ptr->GetTypeString() << std::endl; - - if(avg_time < best_avg_time) - { - best_instance_name = inst_ptr->GetTypeString(); - best_avg_time = avg_time; - best_gb_per_sec = gb_per_sec; - } - - if(do_verification) - { - y_dev.FromDevice(y.mData.data()); - - bool pass = - ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); - - if(do_log) - { - LogRangeAsType(std::cout << "a : ", a.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b : ", b.mData, ",") << std::endl; - LogRangeAsType(std::cout << "host_y : ", host_y.mData, ",") << std::endl; - LogRangeAsType(std::cout << "y : ", y.mData, ",") << std::endl; - } - - if(!pass) - { - std::cout << inst_ptr->GetTypeString() << " failed verification: "; - LogRange(std::cout << "lengths = [", length, ", ") << "]." 
<< std::endl; - return false; - } - else - { - if(time_kernel) - std::cout << "pass" << std::endl; - } - } - } - - if(time_kernel) - { - LogRange(std::cout << "length = ", length, ",") << ", "; - std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " - << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; - } - - if(num_kernel == 0) - { - std::cout << "Error: No kernel is tested" << std::endl; - return false; - } - - return true; -} - -} // namespace profiler -} // namespace ck diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cbe2937ef..e1b0b9c6e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -52,4 +52,3 @@ add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) add_subdirectory(normalization) add_subdirectory(data_type) -add_subdirectory(elementwise_normalization) diff --git a/test/elementwise_normalization/CMakeLists.txt b/test/elementwise_normalization/CMakeLists.txt deleted file mode 100644 index a20eb2632..000000000 --- a/test/elementwise_normalization/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_custom_target(test_elementwise_normalization) - -add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) - -target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance) - -add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp deleted file mode 100644 index f01e963bd..000000000 --- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "gtest/gtest.h" -#include "profiler/include/profile_elementwise_layernorm_impl.hpp" - -using F16 = ck::half_t; -using F32 = float; -using ck::index_t; - -template -class TestElementwiseLayernorm : public ::testing::Test -{ - protected: - using ADataType = std::tuple_element_t<0, Tuple>; - using BDataType = std::tuple_element_t<1, Tuple>; - using GammaDataType = std::tuple_element_t<2, Tuple>; - using BetaDataType = std::tuple_element_t<3, Tuple>; - using AccDataType = std::tuple_element_t<4, Tuple>; - using YDataType = std::tuple_element_t<5, Tuple>; - - void Run() - { - // M, N - std::vector> lengths = { - {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}}; - - for(auto length : lengths) - { - bool success = ck::profiler::profile_elementwise_layernorm_impl( - true, 2, false, false, length); - EXPECT_TRUE(success); - } - } -}; - -using KernelTypes = ::testing::Types< - // ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType> - std::tuple>; - -TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes); -TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); } diff --git a/test/normalization/CMakeLists.txt b/test/normalization/CMakeLists.txt index 4890f2f75..ab6e2d1cd 100644 --- a/test/normalization/CMakeLists.txt +++ b/test/normalization/CMakeLists.txt @@ -3,9 +3,9 @@ add_custom_target(test_layernorm) add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp) add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp) add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp) -add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) +add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) -target_link_libraries(test_layernorm2d_fp32 PRIVATE utility) +target_link_libraries(test_layernorm2d_fp32 PRIVATE utility) target_link_libraries(test_layernorm2d_fp16 PRIVATE utility) target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance) target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance) @@ -14,3 +14,4 @@ add_dependencies(test_layernorm test_layernorm2d_fp32) add_dependencies(test_layernorm test_layernorm2d_fp16) add_dependencies(test_layernorm test_groupnorm_fp16) add_dependencies(test_layernorm test_groupnorm_fp32) + -- GitLab From dda3a0a10bb62a6d47e3559b89146a9d02361502 Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Tue, 25 Oct 2022 23:39:11 +0800 Subject: [PATCH 06/95] Update to the Reduction API and instances (#476) * Simplify the macros for declaring and defining the add_device_reduce_instance_xxxx() instances * Change the types of lengths and strides from std::vector to std::array for the reduction device interfaces * Remove DeviceSoftmaxImpl's dependency on DeviceReduceMultiblock * Split the cpp and hpp files for reduction instances to enable more parallel compilation * Remove the use of macros for declaring reduction instances and instance references * Update to add_device_reduce_instance_xxxx templated functions * Use ReduceOperation+InElementwiseOp+AccElementwiseOp to replace the ReduceOpId in defining add_reduce_instance_xxxx() templates * Change return format --- example/12_reduce/reduce_blockwise.cpp | 6 +- example/12_reduce/reduce_blockwise_impl.hpp | 30 +-- .../12_reduce/reduce_blockwise_two_call.cpp | 52 ++--- example/12_reduce/reduce_example_common.hpp | 13 +- .../reduce_multiblock_atomic_add.cpp | 6 +- .../reduce_multiblock_atomic_add_impl.hpp | 30 +-- .../gpu/device/device_reduce.hpp | 32 +--
.../device/impl/device_reduce_multiblock.hpp | 80 +++---- .../device/impl/device_reduce_threadwise.hpp | 66 +++--- .../gpu/device/impl/device_softmax_impl.hpp | 195 ++++++++++++----- .../device_operation_instance_factory.hpp | 7 +- .../gpu/reduce/device_reduce_instance.hpp | 95 ++++++-- .../device_reduce_instance_blockwise.hpp | 77 +------ ..._reduce_instance_blockwise_b16_f32_b16.hpp | 59 ----- ...uce_instance_blockwise_b16_f32_b16_add.hpp | 27 +++ ...ce_instance_blockwise_b16_f32_b16_amax.hpp | 31 +++ ...uce_instance_blockwise_b16_f32_b16_avg.hpp | 27 +++ ...uce_instance_blockwise_b16_f32_b16_max.hpp | 31 +++ ...uce_instance_blockwise_b16_f32_b16_min.hpp | 31 +++ ...e_instance_blockwise_b16_f32_b16_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f16_f16_f16.hpp | 46 ---- ...ce_instance_blockwise_f16_f16_f16_amax.hpp | 31 +++ ...uce_instance_blockwise_f16_f16_f16_max.hpp | 31 +++ ...uce_instance_blockwise_f16_f16_f16_min.hpp | 31 +++ ..._reduce_instance_blockwise_f16_f32_f16.hpp | 34 --- ...uce_instance_blockwise_f16_f32_f16_add.hpp | 27 +++ ...uce_instance_blockwise_f16_f32_f16_avg.hpp | 27 +++ ...e_instance_blockwise_f16_f32_f16_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f32_f32_f32.hpp | 58 ----- ...uce_instance_blockwise_f32_f32_f32_add.hpp | 27 +++ ...ce_instance_blockwise_f32_f32_f32_amax.hpp | 31 +++ ...uce_instance_blockwise_f32_f32_f32_avg.hpp | 27 +++ ...uce_instance_blockwise_f32_f32_f32_max.hpp | 31 +++ ...uce_instance_blockwise_f32_f32_f32_min.hpp | 31 +++ ...e_instance_blockwise_f32_f32_f32_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f32_f64_f32.hpp | 34 --- ...uce_instance_blockwise_f32_f64_f32_add.hpp | 27 +++ ...uce_instance_blockwise_f32_f64_f32_avg.hpp | 27 +++ ...e_instance_blockwise_f32_f64_f32_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f64_f64_f64.hpp | 58 ----- ...uce_instance_blockwise_f64_f64_f64_add.hpp | 27 +++ ...ce_instance_blockwise_f64_f64_f64_amax.hpp | 31 +++ ...uce_instance_blockwise_f64_f64_f64_avg.hpp | 27 +++ ...uce_instance_blockwise_f64_f64_f64_max.hpp | 31 +++ ...uce_instance_blockwise_f64_f64_f64_min.hpp | 31 +++ ...e_instance_blockwise_f64_f64_f64_norm2.hpp | 27 +++ ...ce_reduce_instance_blockwise_i8_i32_i8.hpp | 30 --- ...educe_instance_blockwise_i8_i32_i8_add.hpp | 27 +++ ...educe_instance_blockwise_i8_i32_i8_avg.hpp | 27 +++ ...ice_reduce_instance_blockwise_i8_i8_i8.hpp | 46 ---- ...educe_instance_blockwise_i8_i8_i8_amax.hpp | 31 +++ ...reduce_instance_blockwise_i8_i8_i8_max.hpp | 31 +++ ...reduce_instance_blockwise_i8_i8_i8_min.hpp | 31 +++ .../device_reduce_instance_impl_common.hpp | 13 ++ ..._reduce_instance_multiblock_atomic_add.hpp | 156 ++++--------- ...ance_multiblock_atomic_add_b16_f32_f32.hpp | 30 --- ..._multiblock_atomic_add_b16_f32_f32_add.hpp | 27 +++ ..._multiblock_atomic_add_b16_f32_f32_avg.hpp | 27 +++ ...ance_multiblock_atomic_add_f16_f32_f32.hpp | 30 --- ..._multiblock_atomic_add_f16_f32_f32_add.hpp | 27 +++ ..._multiblock_atomic_add_f16_f32_f32_avg.hpp | 27 +++ ...ance_multiblock_atomic_add_f32_f32_f32.hpp | 30 --- ..._multiblock_atomic_add_f32_f32_f32_add.hpp | 27 +++ ..._multiblock_atomic_add_f32_f32_f32_avg.hpp | 27 +++ ...ance_multiblock_atomic_add_f32_f64_f32.hpp | 30 --- ..._multiblock_atomic_add_f32_f64_f32_add.hpp | 28 +++ ..._multiblock_atomic_add_f32_f64_f32_avg.hpp | 28 +++ ...ance_multiblock_atomic_add_f64_f64_f64.hpp | 30 --- ..._multiblock_atomic_add_f64_f64_f64_add.hpp | 27 +++ ..._multiblock_atomic_add_f64_f64_f64_avg.hpp | 27 +++ .../device_reduce_instance_threadwise.hpp | 77 +------ 
...reduce_instance_threadwise_b16_f32_b16.hpp | 59 ----- ...ce_instance_threadwise_b16_f32_b16_add.hpp | 27 +++ ...e_instance_threadwise_b16_f32_b16_amax.hpp | 31 +++ ...ce_instance_threadwise_b16_f32_b16_avg.hpp | 27 +++ ...ce_instance_threadwise_b16_f32_b16_max.hpp | 31 +++ ...ce_instance_threadwise_b16_f32_b16_min.hpp | 31 +++ ..._instance_threadwise_b16_f32_b16_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f16_f16_f16.hpp | 46 ---- ...e_instance_threadwise_f16_f16_f16_amax.hpp | 31 +++ ...ce_instance_threadwise_f16_f16_f16_max.hpp | 31 +++ ...ce_instance_threadwise_f16_f16_f16_min.hpp | 31 +++ ...reduce_instance_threadwise_f16_f32_f16.hpp | 34 --- ...ce_instance_threadwise_f16_f32_f16_add.hpp | 27 +++ ...ce_instance_threadwise_f16_f32_f16_avg.hpp | 27 +++ ..._instance_threadwise_f16_f32_f16_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f32_f32_f32.hpp | 58 ----- ...ce_instance_threadwise_f32_f32_f32_add.hpp | 27 +++ ...e_instance_threadwise_f32_f32_f32_amax.hpp | 31 +++ ...ce_instance_threadwise_f32_f32_f32_avg.hpp | 27 +++ ...ce_instance_threadwise_f32_f32_f32_max.hpp | 31 +++ ...ce_instance_threadwise_f32_f32_f32_min.hpp | 31 +++ ..._instance_threadwise_f32_f32_f32_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f32_f64_f32.hpp | 34 --- ...ce_instance_threadwise_f32_f64_f32_add.hpp | 27 +++ ...ce_instance_threadwise_f32_f64_f32_avg.hpp | 27 +++ ..._instance_threadwise_f32_f64_f32_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f64_f64_f64.hpp | 58 ----- ...ce_instance_threadwise_f64_f64_f64_add.hpp | 27 +++ ...e_instance_threadwise_f64_f64_f64_amax.hpp | 31 +++ ...ce_instance_threadwise_f64_f64_f64_avg.hpp | 27 +++ ...ce_instance_threadwise_f64_f64_f64_max.hpp | 31 +++ ...ce_instance_threadwise_f64_f64_f64_min.hpp | 31 +++ ..._instance_threadwise_f64_f64_f64_norm2.hpp | 27 +++ ...e_reduce_instance_threadwise_i8_i32_i8.hpp | 30 --- ...duce_instance_threadwise_i8_i32_i8_add.hpp | 27 +++ ...duce_instance_threadwise_i8_i32_i8_avg.hpp | 27 +++ ...ce_reduce_instance_threadwise_i8_i8_i8.hpp | 46 ---- ...duce_instance_threadwise_i8_i8_i8_amax.hpp | 31 +++ ...educe_instance_threadwise_i8_i8_i8_max.hpp | 31 +++ ...educe_instance_threadwise_i8_i8_i8_min.hpp | 31 +++ .../ck/library/utility/host_reduction.hpp | 10 +- .../gpu/reduce/CMakeLists.txt | 95 ++++++-- ..._reduce_instance_blockwise_b16_f32_b16.cpp | 56 ----- ...uce_instance_blockwise_b16_f32_b16_add.cpp | 24 ++ ...ce_instance_blockwise_b16_f32_b16_amax.cpp | 28 +++ ...uce_instance_blockwise_b16_f32_b16_avg.cpp | 24 ++ ...uce_instance_blockwise_b16_f32_b16_max.cpp | 28 +++ ...uce_instance_blockwise_b16_f32_b16_min.cpp | 28 +++ ...e_instance_blockwise_b16_f32_b16_norm2.cpp | 24 ++ ..._reduce_instance_blockwise_f16_f16_f16.cpp | 43 ---- ...ce_instance_blockwise_f16_f16_f16_amax.cpp | 28 +++ ...uce_instance_blockwise_f16_f16_f16_max.cpp | 28 +++ ...uce_instance_blockwise_f16_f16_f16_min.cpp | 28 +++ ..._reduce_instance_blockwise_f16_f32_f16.cpp | 31 --- ...uce_instance_blockwise_f16_f32_f16_add.cpp | 24 ++ ...uce_instance_blockwise_f16_f32_f16_avg.cpp | 24 ++ ...e_instance_blockwise_f16_f32_f16_norm2.cpp | 24 ++ ..._reduce_instance_blockwise_f32_f32_f32.cpp | 55 ----- ...uce_instance_blockwise_f32_f32_f32_add.cpp | 24 ++ ...ce_instance_blockwise_f32_f32_f32_amax.cpp | 28 +++ ...uce_instance_blockwise_f32_f32_f32_avg.cpp | 24 ++ ...uce_instance_blockwise_f32_f32_f32_max.cpp | 28 +++ ...uce_instance_blockwise_f32_f32_f32_min.cpp | 28 +++ ...e_instance_blockwise_f32_f32_f32_norm2.cpp | 25 +++ ..._reduce_instance_blockwise_f32_f64_f32.cpp 
| 30 --- ...uce_instance_blockwise_f32_f64_f32_add.cpp | 23 ++ ...uce_instance_blockwise_f32_f64_f32_avg.cpp | 23 ++ ...e_instance_blockwise_f32_f64_f32_norm2.cpp | 23 ++ ..._reduce_instance_blockwise_f64_f64_f64.cpp | 55 ----- ...uce_instance_blockwise_f64_f64_f64_add.cpp | 24 ++ ...ce_instance_blockwise_f64_f64_f64_amax.cpp | 28 +++ ...uce_instance_blockwise_f64_f64_f64_avg.cpp | 24 ++ ...uce_instance_blockwise_f64_f64_f64_max.cpp | 28 +++ ...uce_instance_blockwise_f64_f64_f64_min.cpp | 28 +++ ...e_instance_blockwise_f64_f64_f64_norm2.cpp | 24 ++ ...ce_reduce_instance_blockwise_i8_i32_i8.cpp | 27 --- ...educe_instance_blockwise_i8_i32_i8_add.cpp | 24 ++ ...educe_instance_blockwise_i8_i32_i8_avg.cpp | 24 ++ ...ice_reduce_instance_blockwise_i8_i8_i8.cpp | 43 ---- ...educe_instance_blockwise_i8_i8_i8_amax.cpp | 28 +++ ...reduce_instance_blockwise_i8_i8_i8_max.cpp | 28 +++ ...reduce_instance_blockwise_i8_i8_i8_min.cpp | 28 +++ ...ance_multiblock_atomic_add_b16_f32_f32.cpp | 26 --- ..._multiblock_atomic_add_b16_f32_f32_add.cpp | 23 ++ ..._multiblock_atomic_add_b16_f32_f32_avg.cpp | 23 ++ ...ance_multiblock_atomic_add_f16_f32_f32.cpp | 27 --- ..._multiblock_atomic_add_f16_f32_f32_add.cpp | 24 ++ ..._multiblock_atomic_add_f16_f32_f32_avg.cpp | 24 ++ ...ance_multiblock_atomic_add_f32_f32_f32.cpp | 26 --- ..._multiblock_atomic_add_f32_f32_f32_add.cpp | 23 ++ ..._multiblock_atomic_add_f32_f32_f32_avg.cpp | 23 ++ ...ance_multiblock_atomic_add_f32_f64_f32.cpp | 26 --- ..._multiblock_atomic_add_f32_f64_f32_add.cpp | 23 ++ ..._multiblock_atomic_add_f32_f64_f32_avg.cpp | 23 ++ ...ance_multiblock_atomic_add_f64_f64_f64.cpp | 27 --- ..._multiblock_atomic_add_f64_f64_f64_add.cpp | 24 ++ ..._multiblock_atomic_add_f64_f64_f64_avg.cpp | 24 ++ ...reduce_instance_threadwise_b16_f32_b16.cpp | 56 ----- ...ce_instance_threadwise_b16_f32_b16_add.cpp | 24 ++ ...e_instance_threadwise_b16_f32_b16_amax.cpp | 28 +++ ...ce_instance_threadwise_b16_f32_b16_avg.cpp | 24 ++ ...ce_instance_threadwise_b16_f32_b16_max.cpp | 28 +++ ...ce_instance_threadwise_b16_f32_b16_min.cpp | 28 +++ ..._instance_threadwise_b16_f32_b16_norm2.cpp | 24 ++ ...reduce_instance_threadwise_f16_f16_f16.cpp | 43 ---- ...e_instance_threadwise_f16_f16_f16_amax.cpp | 28 +++ ...ce_instance_threadwise_f16_f16_f16_max.cpp | 28 +++ ...ce_instance_threadwise_f16_f16_f16_min.cpp | 28 +++ ...reduce_instance_threadwise_f16_f32_f16.cpp | 30 --- ...ce_instance_threadwise_f16_f32_f16_add.cpp | 23 ++ ...ce_instance_threadwise_f16_f32_f16_avg.cpp | 23 ++ ..._instance_threadwise_f16_f32_f16_norm2.cpp | 23 ++ ...reduce_instance_threadwise_f32_f32_f32.cpp | 55 ----- ...ce_instance_threadwise_f32_f32_f32_add.cpp | 24 ++ ...e_instance_threadwise_f32_f32_f32_amax.cpp | 28 +++ ...ce_instance_threadwise_f32_f32_f32_avg.cpp | 24 ++ ...ce_instance_threadwise_f32_f32_f32_max.cpp | 28 +++ ...ce_instance_threadwise_f32_f32_f32_min.cpp | 28 +++ ..._instance_threadwise_f32_f32_f32_norm2.cpp | 24 ++ ...reduce_instance_threadwise_f32_f64_f32.cpp | 31 --- ...ce_instance_threadwise_f32_f64_f32_add.cpp | 24 ++ ...ce_instance_threadwise_f32_f64_f32_avg.cpp | 24 ++ ..._instance_threadwise_f32_f64_f32_norm2.cpp | 24 ++ ...reduce_instance_threadwise_f64_f64_f64.cpp | 54 ----- ...ce_instance_threadwise_f64_f64_f64_add.cpp | 23 ++ ...e_instance_threadwise_f64_f64_f64_amax.cpp | 27 +++ ...ce_instance_threadwise_f64_f64_f64_avg.cpp | 23 ++ ...ce_instance_threadwise_f64_f64_f64_max.cpp | 27 +++ ...ce_instance_threadwise_f64_f64_f64_min.cpp | 27 +++ ..._instance_threadwise_f64_f64_f64_norm2.cpp | 
23 ++ ...e_reduce_instance_threadwise_i8_i32_i8.cpp | 28 --- ...duce_instance_threadwise_i8_i32_i8_add.cpp | 25 +++ ...duce_instance_threadwise_i8_i32_i8_avg.cpp | 24 ++ ...ce_reduce_instance_threadwise_i8_i8_i8.cpp | 43 ---- ...duce_instance_threadwise_i8_i8_i8_amax.cpp | 28 +++ ...educe_instance_threadwise_i8_i8_i8_max.cpp | 28 +++ ...educe_instance_threadwise_i8_i8_i8_min.cpp | 28 +++ profiler/include/profile_reduce_impl.hpp | 205 ++++++++++-------- 209 files changed, 4652 insertions(+), 2285 deletions(-) delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp delete mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp create mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp create mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp
delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp
create mode 100644
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index c1bcdbb82..fb9a6e640 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification, if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) return; + std::array arrReduceDims; + + std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); + result = reduce_blockwise_impl( - do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); + do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta); matched = true; }); diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp index 1d2769ea9..ad5537eb4 100644 --- a/example/12_reduce/reduce_blockwise_impl.hpp +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -30,7 +30,7 @@ int reduce_blockwise_impl(bool do_verification, int init_method, bool time_kernel, const std::vector& inLengths, - const std::vector& reduceDims, + const std::array& reduceDims, float alpha, float beta) @@ -38,6 +38,8 @@ int reduce_blockwise_impl(bool do_verification, using namespace ck; using namespace ck::tensor_operation::device; + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 
1 : Rank - NumReduceDim; + constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); @@ -143,7 +145,7 @@ int reduce_blockwise_impl(bool do_verification, std::vector outLengths; - std::vector invariantDims = get_invariant_dims(reduceDims); + auto invariantDims = get_invariant_dims(reduceDims); if(invariantDims.empty()) outLengths.push_back(1); @@ -256,22 +258,22 @@ int reduce_blockwise_impl(bool do_verification, acc_elementwise_op); }; - std::vector i_inLengths; - std::vector i_inStrides; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths.assign(inLengths.begin(), inLengths.end()); - i_inStrides.assign(inStrides.begin(), inStrides.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin()); + std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); auto reduce = DeviceReduceInstance{}; - auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, + auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, reduceDims, alpha, beta, diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index a84856c33..a5c24b13a 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -90,15 +90,15 @@ static bool time_kernel; int main(int argc, char* argv[]) { // used by the device reduction - const std::vector reduceDims_1 = {4}; - const std::vector invariantDims_1 = {0, 1, 2, 3}; + const std::array reduceDims_1 = {4}; + // const std::array invariantDims_1 = {0, 1, 2, 3}; - const std::vector reduceDims_2 = {3}; - const std::vector invariantDims_2 = {0, 1, 2}; + const std::array reduceDims_2 = {3}; + // const std::array invariantDims_2 = {0, 1, 2}; // used by the host reduction - const std::vector reduceDims = {3, 4}; - const std::vector invariantDims = {0, 1, 2}; + const std::array reduceDims = {3, 4}; + const std::array invariantDims = {0, 1, 2}; const std::vector inLengths_1 = {64, 320, 80, 4, 128}; @@ -214,26 +214,26 @@ int main(int argc, char* argv[]) acc_elementwise_op); }; - std::vector i_inLengths_1; - std::vector i_inStrides_1; - std::vector i_inLengths_2; - std::vector i_inStrides_2; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths_1; + std::array arrInStrides_1; + std::array arrInLengths_2; + std::array arrInStrides_2; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end()); - i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end()); - i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end()); - i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths_1.begin(), inLengths_1.end(), arrInLengths_1.begin()); + std::copy(inStrides_1.begin(), inStrides_1.end(), arrInStrides_1.begin()); + std::copy(inLengths_2.begin(), 
inLengths_2.end(), arrInLengths_2.begin()); + std::copy(inStrides_2.begin(), inStrides_2.end(), arrInStrides_2.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); auto reduce_1 = DeviceReduceInstance_1{}; - auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1, - i_inStrides_1, - i_inLengths_2, - i_inStrides_2, + auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1, + arrInStrides_1, + arrInLengths_2, + arrInStrides_2, reduceDims_1, 1.0f, 0.0f, @@ -255,10 +255,10 @@ int main(int argc, char* argv[]) auto reduce_2 = DeviceReduceInstance_2{}; - auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2, - i_inStrides_2, - i_outLengths, - i_outStrides, + auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2, + arrInStrides_2, + arrOutLengths, + arrOutStrides, reduceDims_2, alpha, beta, diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp index 6334f608e..05f0a0edb 100644 --- a/example/12_reduce/reduce_example_common.hpp +++ b/example/12_reduce/reduce_example_common.hpp @@ -5,11 +5,10 @@ #include "ck/ck.hpp" -template -std::vector get_invariant_dims(const std::vector& reduceDims) +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) { - assert(NumReduceDim == reduceDims.size()); - int reduceFlag = 0; // flag the bits for the reduceDims @@ -18,13 +17,15 @@ std::vector get_invariant_dims(const std::vector& reduceDims) reduceFlag |= 1 << reduceDims[i]; }; - std::vector invariantDims; + std::array invariantDims; // collect invariant dimensions + int dim = 0; for(int i = 0; i < Rank; i++) if((reduceFlag & (1 << i)) == 0) { - invariantDims.push_back(i); + invariantDims[dim] = i; + dim++; }; return invariantDims; diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp index 9b56598ca..90c04855b 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add.cpp +++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp @@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification, if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) return; + std::array a_reduceDims; + + std::copy(reduceDims.begin(), reduceDims.end(), a_reduceDims.begin()); + result = reduce_multiblock_atomic_add_impl( - do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); + do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta); matched = true; }); diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp index b67854673..0a5355f33 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -29,7 +29,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, int init_method, bool time_kernel, const std::vector& inLengths, - const std::vector& reduceDims, + const std::array& reduceDims, float alpha, float beta) @@ -37,6 +37,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, using namespace ck; using namespace ck::tensor_operation::device; + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 
1 : Rank - NumReduceDim; + constexpr bool op_support_atomic_add = (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG); @@ -84,7 +86,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, std::vector outLengths; - std::vector invariantDims = get_invariant_dims(reduceDims); + auto invariantDims = get_invariant_dims(reduceDims); if(invariantDims.empty()) outLengths.push_back(1); @@ -169,22 +171,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, acc_elementwise_op); }; - std::vector i_inLengths; - std::vector i_inStrides; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths.assign(inLengths.begin(), inLengths.end()); - i_inStrides.assign(inStrides.begin(), inStrides.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin()); + std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); auto reduce = DeviceReduceInstance{}; - auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, + auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, reduceDims, alpha, beta, diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp index 468d0b5ab..15aeb8e91 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp @@ -3,27 +3,30 @@ #pragma once -#include +#include #include -#include -#include "ck/utility/common_header.hpp" -#include "ck/utility/reduction_enums.hpp" +#include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" namespace ck { namespace tensor_operation { namespace device { -template +template struct DeviceReduce : public BaseOperator { + static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 
1 : Rank - NumReduceDim; + virtual std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const void* in_dev, @@ -36,9 +39,12 @@ struct DeviceReduce : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceReducePtr = - std::unique_ptr>; +template +using DeviceReducePtr = std::unique_ptr< + DeviceReduce>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp index da53841cc..0ccac7c74 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -5,9 +5,8 @@ #include #include +#include -#include "ck/utility/common_header.hpp" -#include "ck/utility/reduction_operator.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp" @@ -41,7 +40,8 @@ template -struct DeviceReduceMultiBlock : public DeviceReduce +struct DeviceReduceMultiBlock + : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, @@ -58,8 +58,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce& inLengths, - const std::vector& inStrides, + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides, int blkGroupSize, int numBlockTileIteration) { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); @@ -97,7 +99,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), make_tuple(Sequence<0>{})); return transform_tensor_descriptor(one_dim_inDesc, @@ -111,10 +113,10 @@ struct DeviceReduceMultiBlock : public DeviceReduce::type; using ReduceDims = typename arithmetic_sequence_gen::type; - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); return transform_tensor_descriptor( inDesc, @@ -143,18 +145,20 @@ struct DeviceReduceMultiBlock : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + const auto tupleDstLengths = + 
generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); auto out_grid_desc_m = transform_tensor_descriptor( outDesc, make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), make_tuple(Sequence<0>{})); const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); @@ -170,18 +174,20 @@ struct DeviceReduceMultiBlock : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptorForBufferSet(const std::array& outLengths, + const std::array& outStrides) { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); auto out_grid_desc_m = transform_tensor_descriptor( outDesc, make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), make_tuple(Sequence<0>{})); const auto length = out_grid_desc_m.GetLength(Number<0>{}); @@ -198,11 +204,11 @@ struct DeviceReduceMultiBlock : public DeviceReduce inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + Argument(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const InDataType* in_dev, @@ -272,10 +278,10 @@ struct DeviceReduceMultiBlock : public DeviceReduce inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::array inLengths_; + std::array inStrides_; + std::array outLengths_; + std::array outStrides_; AccDataType alpha_; AccDataType beta_; @@ -459,11 +465,11 @@ struct DeviceReduceMultiBlock : public DeviceReduce - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const void* in_dev, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp index f958a7e67..05e14f080 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp @@ -5,6 +5,7 @@ #include #include +#include #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -34,7 +35,8 @@ template -struct DeviceReduceThreadWise : public DeviceReduce +struct DeviceReduceThreadWise + : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); @@ -49,18 +51,20 @@ struct DeviceReduceThreadWise : public 
DeviceReduce& inLengths, - const std::vector& inStrides) + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides) { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); @@ -70,7 +74,7 @@ struct DeviceReduceThreadWise : public DeviceReduce::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), make_tuple(Sequence<0>{})); return transform_tensor_descriptor(one_dim_inDesc, @@ -84,10 +88,10 @@ struct DeviceReduceThreadWise : public DeviceReduce::type; using ReduceDims = typename arithmetic_sequence_gen::type; - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); return transform_tensor_descriptor( inDesc, @@ -116,18 +120,20 @@ struct DeviceReduceThreadWise : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); auto out_grid_desc_m = transform_tensor_descriptor( outDesc, make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), make_tuple(Sequence<0>{})); const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); @@ -145,11 +151,11 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + Argument(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const InDataType* in_dev, @@ -187,10 +193,10 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::array inLengths_; + std::array inStrides_; + std::array outLengths_; + std::array outStrides_; AccDataType alpha_; AccDataType beta_; @@ -321,11 +327,11 @@ struct DeviceReduceThreadWise : public DeviceReduce - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float 
beta, const void* in_dev, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp index 17f8d13d2..fba820578 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp @@ -8,12 +8,9 @@ #include "ck/utility/reduction_operator.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -50,29 +47,80 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax; // OutDstVectorSize - - using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1)); + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumSrcDim = Rank; + static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + 
auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk; - struct Argument : public Reduction::Argument + struct Argument : public BaseArgument { Argument(const std::vector inLengths, const std::vector inStrides, @@ -113,42 +161,60 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp index 97e9addfb..550a7b034 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -3,24 +3,77 @@ #pragma once -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp" diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index fa76526c5..90cfe837d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -5,6 +5,8 @@ #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" namespace ck { @@ -63,33 +65,20 @@ using reduce_configuration_2_instances_blockwise = std::tuple< >; #endif -template -using deviceReduceBlockWisePtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - template + bool OutputIndex> void add_device_reduce_instance_blockwise( - std::vector>& device_op_instances) + std::vector>& + device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool OutputIndex = Indexable && UseIndex; - static_for<0, std::tuple_size::value, 1>{}( [&](auto i) { using cfg1 = remove_cvref_t( \ - std::vector> & device_op_instances) - -#define ADD_BLOCKWISE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - 
static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_BLOCKWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_blockwise( \ - std::vector> & device_op_instances) - -#define ADD_BLOCKWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp deleted file mode 100644 index 8d1fed046..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 
2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp new file mode 100644 index 000000000..521d93e60 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp new file mode 100644 index 000000000..fe3fd6c0a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp new file mode 100644 index 000000000..52a2b69cd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp new file mode 100644 index 000000000..ee4fee41e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp new file mode 100644 index 000000000..3abdb7f95 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp new file mode 100644 index 000000000..b0dbcf31d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp deleted file mode 100644 index ae7f13ce9..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); 
-// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp new file mode 100644 index 000000000..7bbf3df0a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp new file mode 100644 index 000000000..559f32226 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp new file mode 100644 index 000000000..28c961078 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp deleted file mode 100644 index c26e13659..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp new file mode 100644 index 000000000..5080d2863 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp new file mode 100644 index 000000000..0d24d1537 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp new file mode 100644 index 000000000..c806e807c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp deleted file mode 100644 index 30064d588..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp new file mode 100644 index 000000000..b7c046e75 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp new file mode 100644 index 000000000..771bec1c9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp new file mode 100644 index 000000000..c1fe8addb --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp new file mode 100644 index 000000000..6bc0662fe --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp new file mode 100644 index 000000000..6f8005132 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp new file mode 100644 index 000000000..c771ac4fa --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp deleted file mode 100644 index c9f6a1a5f..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp new file mode 100644 index 000000000..b9ddbb9ae --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp new file mode 100644 index 000000000..390a719ce --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp new file mode 100644 index 000000000..2a9ddbc61 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp deleted file mode 100644 index c598e64cd..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp new file mode 100644 index 000000000..574688444 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp @@ 
-0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp new file mode 100644 index 000000000..ad0f2357e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp new file mode 100644 index 000000000..c7d952763 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp new file mode 100644 index 000000000..ec5622993 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp new file mode 100644 index 000000000..48f66da65 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp new file mode 100644 index 000000000..fabfa5b4c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp deleted file mode 100644 index cd1594992..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp new file mode 100644 index 000000000..e08faec20 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp new file mode 100644 index 000000000..a1e692aae --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp deleted file mode 100644 index bf62f92ad..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// 
clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp new file mode 100644 index 000000000..e9654e8cc --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp new file mode 100644 index 000000000..782442130 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp new file mode 100644 index 000000000..df323d40b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp index 9fc409a08..8c08e5ef2 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp @@ -3,6 +3,9 @@ #pragma once +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + namespace ck { 
namespace tensor_operation { namespace device { @@ -32,6 +35,16 @@ struct ReductionConfiguration_2 static constexpr int KThreadSliceSize_ = KThreadSliceSize; }; +using ReduceAdd = ck::reduce::Add; +using ReduceMin = ck::reduce::Min; +using ReduceMax = ck::reduce::Max; +using ReduceAMax = ck::reduce::AMax; + +using UnarySquare = ck::tensor_operation::element_wise::UnarySquare; +using UnarySqrt = ck::tensor_operation::element_wise::UnarySqrt; +using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide; +using UnaryAbs = ck::tensor_operation::element_wise::UnaryAbs; + #define QUICK_REDUCE_TEST 1 } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index a4c17368f..acf55d068 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -6,6 +6,7 @@ #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" namespace ck { @@ -64,135 +65,58 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< >; #endif -template -using deviceReduceMultiBlockAtomicAddPtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - template + bool OutputIndex> void add_device_reduce_instance_multiblock_atomic_add( - std::vector>& device_op_instances) + std::vector>& + device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool OutputIndex = Indexable && UseIndex; - - static_assert(UseIndex == false, - "AtomicAdd can only be used with reduction operations using no index!"); + static_for<0, + std::tuple_size::value, + 1>{}([&](auto i) { + using cfg1 = remove_cvref_t(reduce_configuration_1_instances_multiblock_atomic_add{}))>; - constexpr bool op_acceptable = - (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL || - ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1); - - constexpr bool out_type_acceptable = - (std::is_same::value || std::is_same::value); - - if constexpr(!op_acceptable || !out_type_acceptable) - return; - else - { static_for<0, - std::tuple_size::value, - 1>{}([&](auto i) { - using cfg1 = remove_cvref_t(reduce_configuration_1_instances_multiblock_atomic_add{}))>; - - static_for< - 0, - std::tuple_size::value, - 1>{}([&](auto j) { - using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; - - using ReduceOpInstance = - DeviceReduceMultiBlock; - - device_op_instances.push_back( - std::make_unique(ReduceOpInstance{})); - }); + std::tuple_size::value, + 1>{}([&](auto j) { + using cfg2 
= remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; + + using ReduceOpInstance = DeviceReduceMultiBlock; + + device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); }); - } + }); }; -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - template void add_device_reduce_instance_multiblock_atomic_add( \ - std::vector> & device_op_instances) - -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_multiblock_atomic_add( \ - std::vector> & device_op_instances) - -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp deleted file mode 100644 index 3efc58506..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp new file mode 100644 index 000000000..f5102f497 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp new file mode 100644 index 000000000..ec513113d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp deleted file mode 100644 index 804cba12c..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp new file mode 100644 index 000000000..3a3d53b8c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp new file mode 100644 index 000000000..bbf439896 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp deleted file mode 100644 index 32eb843a1..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp new file mode 100644 index 000000000..55147a60e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp new file mode 100644 index 000000000..4bff06c6a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp deleted file mode 100644 index 9f2a89247..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp new file mode 100644 index 000000000..daffa1aa4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp new file mode 100644 index 000000000..52c417112 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp deleted file mode 100644 index bd2006999..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp new file mode 100644 index 000000000..2f358b06e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp new file mode 100644 index 000000000..84c99dcc5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index e09fd688d..dfcc8dd85 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -5,6 +5,8 @@ #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" namespace ck { @@ -49,33 +51,20 @@ using reduce_configuration_2_instances_threadwise = std::tuple< >; #endif -template -using deviceReduceThreadWisePtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - template + bool OutputIndex> void add_device_reduce_instance_threadwise( - std::vector>& device_op_instances) + std::vector>& + device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool OutputIndex = Indexable && UseIndex; - using cfg1 = ReductionConfiguration_1<256, 256, 1>; static_for<0, std::tuple_size::value, 1>{}( @@ -89,8 +78,8 @@ void add_device_reduce_instance_threadwise( Rank, NumReduceDim, ReduceOperation, - InElementwiseOperation, - AccElementwiseOperation, + InElementwiseOp, + AccElementwiseOp, PropagateNan, OutputIndex, false, // HaveIndexInputIfOutputIndex @@ -105,52 +94,6 @@ void add_device_reduce_instance_threadwise( }); }; -#define ADD_THREADWISE_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - template void add_device_reduce_instance_threadwise( \ - std::vector> & device_op_instances) - -#define ADD_THREADWISE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - 
Rank, \ - NumReduceDim) - -#define ADD_THREADWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_threadwise( \ - std::vector> & device_op_instances) - -#define ADD_THREADWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp deleted file mode 100644 index 5f7f5c7af..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, 
bhalf_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp new file mode 100644 index 000000000..4168508b2 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp new file mode 100644 index 000000000..317006e3a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp new file mode 100644 index 000000000..fc7718ddc --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp new file mode 100644 index 000000000..e6616386c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp new file mode 100644 index 000000000..a9441b8e8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp new file mode 100644 index 000000000..6820ace8c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp deleted file mode 100644 index 3c21b408c..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, 
half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp new file mode 100644 index 000000000..ab3d4e6e2 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp new file mode 100644 index 000000000..ee08c9635 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp new file mode 100644 index 000000000..1007ca27b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp deleted file mode 100644 index cd116986d..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp new file mode 100644 index 000000000..1d562c499 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp new file mode 100644 index 000000000..5aac638b1 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp new file mode 100644 index 000000000..7a3c76409 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp deleted file mode 100644 index a764735fa..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp new file mode 100644 index 000000000..4685d7b5d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp new file mode 100644 index 000000000..1de338fb4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp new file mode 100644 index 000000000..e86c41a94 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp new file mode 100644 index 000000000..2ca900856 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp new file mode 100644 index 000000000..38380e71e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp new file mode 100644 index 000000000..04c5f3e65 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp deleted file mode 100644 index 7d47c79f8..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp new file mode 100644 index 000000000..fef5d4088 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp new file mode 100644 index 000000000..2416f614c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp new file mode 100644 index 000000000..fbd0285ae --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp deleted file mode 100644 index faced808a..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp new file mode 100644 index 000000000..103b85a01 --- /dev/null +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp new file mode 100644 index 000000000..e01f590f0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp new file mode 100644 index 000000000..14a7459bb --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp new file mode 100644 index 000000000..7dfd80601 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp new file mode 100644 index 000000000..7670a27c8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp new file mode 100644 index 000000000..8bb85f377 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp deleted file mode 100644 index 111ba7a0c..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp new file mode 100644 index 000000000..a005ba8d4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp new file mode 100644 index 000000000..9e8c07eb4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp deleted file mode 100644 index c771f057d..000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, 
int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp new file mode 100644 index 000000000..a69f88f5a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp new file mode 100644 index 000000000..734b31c1e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp new file mode 100644 index 000000000..237bd9696 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/utility/host_reduction.hpp b/library/include/ck/library/utility/host_reduction.hpp index f02ebcd79..7c0c969ac 100644 --- a/library/include/ck/library/utility/host_reduction.hpp +++ b/library/include/ck/library/utility/host_reduction.hpp @@ -96,10 +96,9 @@ struct ReductionHost static constexpr int NumInvariantDim = Rank - NumReduceDim; std::vector outStrides; - std::vector invariantDims; - std::vector reduceDims; IndexDataType divider; + std::array reduceLengths; std::array reduceStrides; std::array invariantLengths; @@ -110,15 +109,12 @@ struct ReductionHost 
ReductionHost(HostTensorDescriptor& inDesc, HostTensorDescriptor& outDesc, - const std::vector& invariantDims_, - const std::vector& reduceDims_) + const std::array invariantDims, + const std::array reduceDims) { // this->outLengths = to_int_vector(outDesc.GetLengths()); this->outStrides = outDesc.GetStrides(); - this->invariantDims = invariantDims_; - this->reduceDims = reduceDims_; - int product = 1; for(int i = 0; i < NumReduceDim; i++) diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt index 4eddd6b64..31ae7226f 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt @@ -1,23 +1,76 @@ add_instance_library(device_reduce_instance - device_reduce_instance_blockwise_f16_f16_f16.cpp - device_reduce_instance_blockwise_f16_f32_f16.cpp - device_reduce_instance_blockwise_f32_f32_f32.cpp - device_reduce_instance_blockwise_f32_f64_f32.cpp - device_reduce_instance_blockwise_f64_f64_f64.cpp - device_reduce_instance_blockwise_i8_i32_i8.cpp - device_reduce_instance_blockwise_i8_i8_i8.cpp - device_reduce_instance_blockwise_b16_f32_b16.cpp - device_reduce_instance_threadwise_f16_f16_f16.cpp - device_reduce_instance_threadwise_f16_f32_f16.cpp - device_reduce_instance_threadwise_f32_f32_f32.cpp - device_reduce_instance_threadwise_f32_f64_f32.cpp - device_reduce_instance_threadwise_f64_f64_f64.cpp - device_reduce_instance_threadwise_i8_i32_i8.cpp - device_reduce_instance_threadwise_i8_i8_i8.cpp - device_reduce_instance_threadwise_b16_f32_b16.cpp - device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp - device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp - device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp - device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp - device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp + device_reduce_instance_blockwise_f16_f16_f16_min.cpp + device_reduce_instance_blockwise_f16_f16_f16_max.cpp + device_reduce_instance_blockwise_f16_f16_f16_amax.cpp + device_reduce_instance_blockwise_f16_f32_f16_add.cpp + device_reduce_instance_blockwise_f16_f32_f16_avg.cpp + device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp + device_reduce_instance_blockwise_f32_f32_f32_add.cpp + device_reduce_instance_blockwise_f32_f32_f32_avg.cpp + device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp + device_reduce_instance_blockwise_f32_f32_f32_min.cpp + device_reduce_instance_blockwise_f32_f32_f32_max.cpp + device_reduce_instance_blockwise_f32_f32_f32_amax.cpp + device_reduce_instance_blockwise_f32_f64_f32_add.cpp + device_reduce_instance_blockwise_f32_f64_f32_avg.cpp + device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp + device_reduce_instance_blockwise_f64_f64_f64_add.cpp + device_reduce_instance_blockwise_f64_f64_f64_avg.cpp + device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp + device_reduce_instance_blockwise_f64_f64_f64_min.cpp + device_reduce_instance_blockwise_f64_f64_f64_max.cpp + device_reduce_instance_blockwise_f64_f64_f64_amax.cpp + device_reduce_instance_blockwise_i8_i32_i8_add.cpp + device_reduce_instance_blockwise_i8_i32_i8_avg.cpp + device_reduce_instance_blockwise_i8_i8_i8_min.cpp + device_reduce_instance_blockwise_i8_i8_i8_max.cpp + device_reduce_instance_blockwise_i8_i8_i8_amax.cpp + device_reduce_instance_blockwise_b16_f32_b16_add.cpp + device_reduce_instance_blockwise_b16_f32_b16_avg.cpp + device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp + 
device_reduce_instance_blockwise_b16_f32_b16_min.cpp + device_reduce_instance_blockwise_b16_f32_b16_max.cpp + device_reduce_instance_blockwise_b16_f32_b16_amax.cpp + device_reduce_instance_threadwise_f16_f16_f16_min.cpp + device_reduce_instance_threadwise_f16_f16_f16_max.cpp + device_reduce_instance_threadwise_f16_f16_f16_amax.cpp + device_reduce_instance_threadwise_f16_f32_f16_add.cpp + device_reduce_instance_threadwise_f16_f32_f16_avg.cpp + device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp + device_reduce_instance_threadwise_f32_f32_f32_add.cpp + device_reduce_instance_threadwise_f32_f32_f32_avg.cpp + device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp + device_reduce_instance_threadwise_f32_f32_f32_min.cpp + device_reduce_instance_threadwise_f32_f32_f32_max.cpp + device_reduce_instance_threadwise_f32_f32_f32_amax.cpp + device_reduce_instance_threadwise_f32_f64_f32_add.cpp + device_reduce_instance_threadwise_f32_f64_f32_avg.cpp + device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp + device_reduce_instance_threadwise_f64_f64_f64_add.cpp + device_reduce_instance_threadwise_f64_f64_f64_avg.cpp + device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp + device_reduce_instance_threadwise_f64_f64_f64_min.cpp + device_reduce_instance_threadwise_f64_f64_f64_max.cpp + device_reduce_instance_threadwise_f64_f64_f64_amax.cpp + device_reduce_instance_threadwise_i8_i32_i8_add.cpp + device_reduce_instance_threadwise_i8_i32_i8_avg.cpp + device_reduce_instance_threadwise_i8_i8_i8_min.cpp + device_reduce_instance_threadwise_i8_i8_i8_max.cpp + device_reduce_instance_threadwise_i8_i8_i8_amax.cpp + device_reduce_instance_threadwise_b16_f32_b16_add.cpp + device_reduce_instance_threadwise_b16_f32_b16_avg.cpp + device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp + device_reduce_instance_threadwise_b16_f32_b16_min.cpp + device_reduce_instance_threadwise_b16_f32_b16_max.cpp + device_reduce_instance_threadwise_b16_f32_b16_amax.cpp + device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp + device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp + device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp deleted file mode 100644 index c97efbc90..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp new file mode 100644 index 000000000..1909183a5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
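The deleted monolithic instance files above enumerate kernels through ADD_BLOCKWISE_INST_BY_ID, whose ReduceOpId column is a numeric code; the inline comments give the mapping (0 = ADD, 2 = MIN, 3 = MAX, 4 = AMAX, 5 = AVG, 7 = NORM2), and IndicesOpt 0/1 selects the plain versus index-returning variants. A hedged sketch of that id mapping as an enum is below; the authoritative definition is the one in ck/utility/reduction_enums.hpp, and its exact spelling may differ.

    // Reduce-operation ids as used by ADD_BLOCKWISE_INST_BY_ID. The numeric values
    // come from the comments in the deleted instance files; the enum and value
    // names here are illustrative only.
    enum class ReduceOpIdSketch
    {
        ADD   = 0,
        MIN   = 2,
        MAX   = 3,
        AMAX  = 4,
        AVG   = 5,
        NORM2 = 7,
    };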
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp new file mode 100644 index 000000000..ec3020102 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp new file mode 100644 index 000000000..89f3e5828 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
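Each of the new per-operation translation units replaces the id-based macro with explicit instantiations of add_device_reduce_instance_blockwise, one per supported Rank/NumReduceDim combination, following the column comment InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex. The angle-bracket argument lists are not reproduced in these hunks; a plausible reconstruction of a single instantiation from the bf16 add file is sketched below, where the reduce operation, the elementwise operations, and the device-pointer alias are assumptions about the CK API rather than text copied from the patch:

    // Hedged sketch of one explicit instantiation (Rank = 4, NumReduceDim = 3);
    // ReduceAdd, PassThrough and DeviceReducePtr are assumed names.
    template void add_device_reduce_instance_blockwise<bhalf_t,     // InDataType
                                                       float,       // AccDataType
                                                       bhalf_t,     // OutDataType
                                                       4,           // Rank
                                                       3,           // NumReduceDim
                                                       ReduceAdd,   // ReduceOperation
                                                       PassThrough, // InElementwiseOp
                                                       PassThrough, // AccElementwiseOp
                                                       false,       // PropagateNan
                                                       false>       // UseIndex
        (std::vector<DeviceReducePtr<PassThrough, PassThrough>>&);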
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp new file mode 100644 index 000000000..f1bdd1927 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp new file mode 100644 index 000000000..58e9c5622 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp new file mode 100644 index 000000000..e5012c651 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp deleted file mode 100644 index 5e73b3d8b..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp new file mode 100644 index 000000000..0970cb9d7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp new file mode 100644 index 000000000..6ee179a51 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp new file mode 100644 index 000000000..e53b40306 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp deleted file mode 100644 index 93d3e2701..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp new file mode 100644 index 000000000..cab5738fb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp new file mode 100644 index 000000000..7d2a4fad2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp new file mode 100644 index 000000000..e08b64f8b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp deleted file mode 100644 index 38800ddde..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX 
-ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp new file mode 100644 index 000000000..89cabf376 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp new file mode 100644 index 000000000..1e602c121 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp new file mode 100644 index 000000000..489b4bc45 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp new file mode 100644 index 000000000..04e2c5b16 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp new file mode 100644 index 000000000..5c0e53604 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp new file mode 100644 index 000000000..899dfcd37 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp deleted file mode 100644 index b821aeee0..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp new file mode 100644 index 000000000..5624337a4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp new file mode 100644 index 000000000..2f3067ce2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp new file mode 100644 index 000000000..2648e7d59 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp deleted file mode 100644 index 074d0cfdf..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); 
-ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp new file mode 100644 index 000000000..f67ae2ee7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp new file mode 100644 index 000000000..6f8e07851 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp new file mode 100644 index 000000000..69fecf72f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp new file mode 100644 index 000000000..129a4f0f0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp new file mode 100644 index 000000000..21babc4aa --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp new file mode 100644 index 000000000..b85b3e2b6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp deleted file mode 100644 index e803fb842..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp new file mode 100644 index 000000000..24a8293b5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp new file mode 100644 index 000000000..73e60fa95 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp deleted file mode 100644 index 4bf4139d2..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp new file mode 100644 index 000000000..72e649d89 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp new file mode 100644 index 000000000..a7e053a06 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp new file mode 100644 index 000000000..0e3abd35b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp deleted file mode 100644 index a571655cd..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp new file mode 100644 index 000000000..4b3245607 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp new file mode 100644 index 000000000..3298587a4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp deleted file mode 100644 index 9ad9a630b..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp new file mode 100644 index 000000000..729d4fd6e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp new file mode 100644 index 000000000..e3e36e312 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp deleted file mode 100644 index 4ee70702c..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp new file mode 100644 index 000000000..e7580e7d7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp new file mode 100644 index 000000000..1e6feb007 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp deleted file mode 100644 index 8c5fa80e8..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp new file mode 100644 index 000000000..669c4d34c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp new file mode 100644 index 000000000..335a5474c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp deleted file mode 100644 index d2b81c486..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp new file mode 100644 index 000000000..e95e8391a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp new file mode 100644 index 000000000..25498158a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp deleted file mode 100644 index 8d678e784..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp new file mode 100644 index 000000000..7262b8a5b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp new file mode 100644 index 000000000..c526a74f1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp new file mode 100644 index 000000000..4c7252e74 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp new file mode 100644 index 000000000..618900a7d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp new file mode 100644 index 000000000..ce747cbc7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp new file mode 100644 index 000000000..06f622b9e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp deleted file mode 100644 index 010560586..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp new file mode 100644 index 000000000..708eb58d4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp new file mode 100644 index 000000000..c8a62fa14 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp new file mode 100644 index 000000000..ce2092153 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp deleted file mode 100644 index 55c53dfd5..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp new file mode 100644 index 000000000..29251a8b9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp new file mode 100644 index 000000000..734fa9fd3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp new file mode 100644 index 000000000..d7a0e2bfe --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp deleted file mode 100644 index 367cf9a65..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 
0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp new file mode 100644 index 000000000..8b97f3008 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp new file mode 100644 index 000000000..53d01e38d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp new file mode 100644 index 000000000..125d054f3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp new file mode 100644 index 000000000..fb86a2bbe --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp new file mode 100644 index 000000000..49af08390 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp new file mode 100644 index 000000000..30cc1b13e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp deleted file mode 100644 index 18fd08448..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp new file mode 100644 index 000000000..24f8a9ba5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp new file mode 100644 index 000000000..a26702f05 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp new file mode 100644 index 000000000..34fe32628 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp deleted file mode 100644 index 3d02f3cbe..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); 
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp new file mode 100644 index 000000000..74b15eddb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp new file mode 100644 index 000000000..65762492f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp new file mode 100644 index 000000000..5e74295a0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp new file mode 100644 index 000000000..6fdea6cc4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp new file mode 100644 index 000000000..317d573da --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp new file mode 100644 index 000000000..29f95ebcc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp deleted file mode 100644 index fcf072a08..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp new file mode 100644 index 000000000..aa9f47cbc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp new file mode 100644 index 000000000..54a9dd1ab --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp deleted file mode 100644 index 85d7ce8b4..000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp new file mode 100644 index 000000000..4ef5717b5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp new file mode 100644 index 000000000..140a3c197 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp new file mode 100644 index 000000000..317b4ad39 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 2d06ec22c..981962bdc 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -18,57 +18,61 @@ namespace tensor_operation { namespace device { namespace instance { -template +template struct ReduceDescription { - static constexpr int Rank_ = Rank; - static constexpr int NumReduceDim_ = NumReduceDim; - static constexpr int ReduceOpId_ = ReduceOpId; - static constexpr int PropagateNan_ = PropagateNan; - static constexpr int UseIndex_ = UseIndex; + static constexpr index_t Rank_ = Rank; + static constexpr index_t NumReduceDim_ = NumReduceDim; + static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId; + static constexpr bool PropagateNan_ = PropagateNan; + static constexpr bool UseIndex_ = UseIndex; }; using reduce_description_instances = - std::tuple, // for ADD - ReduceDescription<4, 4, 0, false, false>, - ReduceDescription<4, 1, 0, false, false>, - ReduceDescription<2, 1, 0, false, false>, - - ReduceDescription<4, 3, 5, false, false>, // for AVG - ReduceDescription<4, 4, 5, false, false>, - ReduceDescription<4, 1, 5, false, false>, - ReduceDescription<2, 1, 5, false, false>, - - ReduceDescription<4, 3, 7, false, false>, // for NORM2 - ReduceDescription<4, 4, 7, false, false>, - ReduceDescription<4, 1, 7, false, false>, - ReduceDescription<2, 1, 7, false, false>, - - ReduceDescription<4, 3, 2, false, false>, // for MIN - ReduceDescription<4, 4, 2, false, false>, - ReduceDescription<4, 1, 2, false, false>, - ReduceDescription<2, 1, 2, false, false>, - ReduceDescription<4, 3, 3, false, false>, // for MAX - ReduceDescription<4, 4, 3, false, false>, - ReduceDescription<4, 1, 3, false, false>, - ReduceDescription<2, 1, 3, false, false>, - ReduceDescription<4, 3, 4, false, false>, // for AMAX - ReduceDescription<4, 4, 4, false, false>, - ReduceDescription<4, 1, 4, false, false>, - ReduceDescription<2, 1, 4, false, false>, - - ReduceDescription<4, 3, 2, false, true>, // for MIN - ReduceDescription<4, 4, 2, false, true>, - ReduceDescription<4, 1, 2, false, true>, - ReduceDescription<2, 1, 2, false, true>, - ReduceDescription<4, 3, 3, false, true>, // for MAX - ReduceDescription<4, 4, 3, false, true>, - ReduceDescription<4, 1, 3, false, true>, - ReduceDescription<2, 1, 3, false, true>, - ReduceDescription<4, 3, 4, false, true>, // for AMAX - ReduceDescription<4, 4, 4, false, true>, - 
ReduceDescription<4, 1, 4, false, true>, - ReduceDescription<2, 1, 4, false, true>>; + std::tuple, // for ADD + ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG + ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2 + ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN + ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX + ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX + ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN + ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>, + ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX + ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>, + ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX + ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>; template bool description_match(const DescriptionType& description, @@ -78,9 +82,8 @@ bool description_match(const DescriptionType& description, bool PropagateNan, bool UseIndex) { - if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast(ReduceOpId) || - description.PropagateNan_ != static_cast(PropagateNan) || - description.UseIndex_ != static_cast(UseIndex)) + if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId || + description.PropagateNan_ != PropagateNan || description.UseIndex_ != UseIndex) return (false); if(DescriptionType::NumReduceDim_ != reduceDims.size()) @@ -99,11 +102,10 @@ bool description_match(const DescriptionType& description, namespace ck { namespace profiler { -template -static inline std::vector get_invariant_dims(const std::vector& reduceDims) +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) { - assert(NumReduceDim == reduceDims.size()); - int reduceFlag = 0; // flag the bits for the reduceDims @@ -112,13 +114,15 @@ static inline std::vector get_invariant_dims(const std::vector& reduce reduceFlag |= 1 << reduceDims[i]; }; - std::vector invariantDims; + 
std::array invariantDims; // collect invariant dimensions + int dim = 0; for(int i = 0; i < Rank; i++) if((reduceFlag & (1 << i)) == 0) { - invariantDims.push_back(i); + invariantDims[dim] = i; + dim++; }; return invariantDims; @@ -137,7 +141,7 @@ bool profile_reduce_impl_impl(bool do_verification, bool do_dumpout, bool time_kernel, const std::vector& inLengths, - const std::vector& reduceDims, + const std::array& reduceDims, float alpha, float beta) { @@ -145,6 +149,8 @@ bool profile_reduce_impl_impl(bool do_verification, using namespace ck::tensor_operation::device::instance; using ck::host_common::dumpBufferToFile; + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim; + constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); @@ -279,28 +285,32 @@ bool profile_reduce_impl_impl(bool do_verification, reduce_unary_operator::GetElementwiseOperator( static_cast(reduce_total_length)); - using DeviceReduceInstPtr0 = - DeviceReducePtr; + using DeviceReduceInstPtr = + DeviceReducePtr; - std::vector reduce0_ptrs; + std::vector reduce_ptrs; add_device_reduce_instance_threadwise(reduce0_ptrs); + UseIndex>(reduce_ptrs); add_device_reduce_instance_blockwise(reduce0_ptrs); + UseIndex>(reduce_ptrs); if constexpr(use_atomic_add) { @@ -309,12 +319,14 @@ bool profile_reduce_impl_impl(bool do_verification, OutDataType, Rank, NumReduceDim, - ReduceOpId, + ReduceOperation, + InElementwiseOperation, + AccElementwiseOperation, PropagateNan, - UseIndex>(reduce0_ptrs); + UseIndex>(reduce_ptrs); } - if(reduce0_ptrs.empty()) + if(reduce_ptrs.empty()) { throw std::runtime_error("Wrong! No device REDUCE instance found"); }; @@ -342,22 +354,22 @@ bool profile_reduce_impl_impl(bool do_verification, acc_elementwise_op); }; - std::vector i_inLengths; - std::vector i_inStrides; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths.assign(inLengths.begin(), inLengths.end()); - i_inStrides.assign(inStrides.begin(), inStrides.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin()); + std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); - for(auto& reduce_ptr : reduce0_ptrs) + for(auto& reduce_ptr : reduce_ptrs) { - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, + auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, reduceDims, alpha, beta, @@ -478,22 +490,25 @@ bool profile_reduce_impl(bool do_verification, descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex)) return; - pass = pass && - profile_reduce_impl_impl(descType::ReduceOpId_), - static_cast(descType::PropagateNan_), - static_cast(descType::UseIndex_)>(do_verification, - init_method, - do_dumpout, - time_kernel, - inLengths, - reduceDims, - alpha, - beta); + std::array arrReduceDims; + + std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); + + pass = pass && profile_reduce_impl_impl(descType::ReduceOpId_), + descType::PropagateNan_, + 
descType::UseIndex_>(do_verification, + init_method, + do_dumpout, + time_kernel, + inLengths, + arrReduceDims, + alpha, + beta); matched = true; }); -- GitLab From 0ee3aea16af66fd33282ce7a505533377fb3a74f Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 26 Oct 2022 09:25:27 -0700 Subject: [PATCH 07/95] fix the script parsing the QA results (#495) --- script/process_perf_data.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/script/process_perf_data.py b/script/process_perf_data.py index de1703cfc..638e4ef56 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -81,7 +81,7 @@ def parse_logfile(logfile): StrideA=[] StrideB=[] StrideC=[] - if 'perf_gemm' in logfile: + if 'perf_gemm.log' in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() @@ -120,14 +120,14 @@ def parse_logfile(logfile): res = [x for _,x in sorted(zip(tests,tflops))] #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] test_list=list(range(1,len(tests)+1)) - #parse conv_fwd performance tests: - elif 'conv_fwd' in logfile: + #parse conv_fwd and conv_bwd performance tests: + elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile: for line in open(logfile): if 'tflops:' in line: lst=line.split() res.append(lst[1]) #parse all other performance tests: - elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'conv_bwd_data' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile: + elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() @@ -149,7 +149,7 @@ def store_new_test_result(table_name, test_results, testlist, branch_name, node_ df=pd.DataFrame(data=[params],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Environment','Datetime']) df_add=pd.DataFrame(data=[test_results],columns=testlist) df=pd.concat([df,df_add],axis=1) - print("new test results dataframe:",df) + #print("new test results dataframe:",df) df.to_sql(table_name,connection,if_exists='append',index=False) return 0 @@ -165,7 +165,7 @@ def compare_test_to_baseline(baseline,test,testlist): print("test # ",i,"shows regression by {:.3f}%".format( (float(test[i])-base_list[i])/base_list[i]*100)) regression=1 - ave_perf=ave_perf+float(test[i])/base_list[i] + if base_list[i]>0: ave_perf=ave_perf+float(test[i])/base_list[i] if regression==0: print("no regressions found") ave_perf=ave_perf/len(base_list) @@ -248,7 +248,7 @@ def main(): conn = sqlEngine.connect() #save gemm performance tests: - if 'perf_gemm' in filename: + if 'perf_gemm.log' in filename: #write the ck_gemm_test_params table only needed once the test set changes #post_test_params(test_list,conn) for i in range(1,len(results)+1): -- GitLab From 57106048aeb20f55461e7c25e689aa0a945beb7a Mon Sep 17 00:00:00 2001 From: Anthony Chang Date: Fri, 28 Oct 2022 02:25:12 +0800 Subject: [PATCH 08/95] Gemm standalone bench executable (#480) * prototype 4 layouts fix default stride all problem sizes tidy move file update build script restore old file fix build * refactor standalone test to use gemm test harness * simplify gemm test * update build script * remove redundant * early return when cmd arg doesn't match * tidy * report failure when result not validated * tidy * Apply suggestions from code review Co-authored-by: Adam Osewski 
<19374865+aosewski@users.noreply.github.com> Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- test/gemm/CMakeLists.txt | 10 ++ test/gemm/gemm_bf16.cpp | 57 +------ test/gemm/gemm_fp16.cpp | 57 +------ test/gemm/gemm_fp32.cpp | 57 +------ test/gemm/gemm_fp64.cpp | 57 +------ test/gemm/gemm_int8.cpp | 57 +------ test/gemm/gemm_standalone_xdl_fp16.cpp | 162 ++++++++++++++++++++ test/gemm/gemm_util.hpp | 107 ++++++++----- test/gemm/instance/gemm_f16_nn_instance.cpp | 86 +++++++++++ test/gemm/instance/gemm_f16_nn_instance.hpp | 41 +++++ test/gemm/instance/gemm_f16_nt_instance.cpp | 86 +++++++++++ test/gemm/instance/gemm_f16_nt_instance.hpp | 41 +++++ test/gemm/instance/gemm_f16_tn_instance.cpp | 86 +++++++++++ test/gemm/instance/gemm_f16_tn_instance.hpp | 41 +++++ test/gemm/instance/gemm_f16_tt_instance.cpp | 86 +++++++++++ test/gemm/instance/gemm_f16_tt_instance.hpp | 41 +++++ test/gemm/run_gemm_test.inc | 41 +++++ 17 files changed, 816 insertions(+), 297 deletions(-) create mode 100644 test/gemm/gemm_standalone_xdl_fp16.cpp create mode 100644 test/gemm/instance/gemm_f16_nn_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_nn_instance.hpp create mode 100644 test/gemm/instance/gemm_f16_nt_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_nt_instance.hpp create mode 100644 test/gemm/instance/gemm_f16_tn_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_tn_instance.hpp create mode 100644 test/gemm/instance/gemm_f16_tt_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_tt_instance.hpp create mode 100644 test/gemm/run_gemm_test.inc diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt index 8069dac15..c427586bb 100644 --- a/test/gemm/CMakeLists.txt +++ b/test/gemm/CMakeLists.txt @@ -13,3 +13,13 @@ target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance) add_test_executable(test_gemm_int8 gemm_int8.cpp) target_link_libraries(test_gemm_int8 PRIVATE utility) target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance) + +add_library(gemm_standalone_xdl_fp16_instances STATIC + instance/gemm_f16_nn_instance.cpp + instance/gemm_f16_nt_instance.cpp + instance/gemm_f16_tn_instance.cpp + instance/gemm_f16_tt_instance.cpp +) +add_test_executable(test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp) +target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility) +target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/) diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp index 6130ec9bc..5290d4663 100644 --- a/test/gemm/gemm_bf16.cpp +++ b/test/gemm/gemm_bf16.cpp @@ -24,56 +24,11 @@ #include "test/gemm/gemm_util.hpp" -int main() -{ - using ADataType = ck::bhalf_t; - using BDataType = ck::bhalf_t; - using CDataType = ck::bhalf_t; - using AccDataType = float; +using ADataType = ck::bhalf_t; +using BDataType = ck::bhalf_t; +using CDataType = ck::bhalf_t; +using AccDataType = float; - using Row = ck::tensor_layout::gemm::RowMajor; - using Col = ck::tensor_layout::gemm::ColumnMajor; +#include "run_gemm_test.inc" - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - auto test = [&](auto a_layout, auto b_layout, auto c_layout) { - bool pass = true; - - using DeviceOp = ck::tensor_operation::device::DeviceGemm; - - const auto gemmPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - for(auto& gemmPtr : gemmPtrs) - { - pass &= ck::gemm_util::TestGemm, - ADataType, - 
BDataType, - CDataType, - AccDataType, - decltype(a_layout), - decltype(b_layout), - decltype(c_layout), - PassThrough, - PassThrough, - PassThrough>{}(gemmPtr); - } - - return pass; - }; - - bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && - test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); - - std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; - return pass ? 0 : 1; -} +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp index 05e696cad..92e225def 100644 --- a/test/gemm/gemm_fp16.cpp +++ b/test/gemm/gemm_fp16.cpp @@ -24,56 +24,11 @@ #include "test/gemm/gemm_util.hpp" -int main() -{ - using ADataType = ck::half_t; - using BDataType = ck::half_t; - using CDataType = ck::half_t; - using AccDataType = float; +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; - using Row = ck::tensor_layout::gemm::RowMajor; - using Col = ck::tensor_layout::gemm::ColumnMajor; +#include "run_gemm_test.inc" - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - auto test = [&](auto a_layout, auto b_layout, auto c_layout) { - bool pass = true; - - using DeviceOp = ck::tensor_operation::device::DeviceGemm; - - const auto gemmPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - for(auto& gemmPtr : gemmPtrs) - { - pass &= ck::gemm_util::TestGemm, - ADataType, - BDataType, - CDataType, - AccDataType, - decltype(a_layout), - decltype(b_layout), - decltype(c_layout), - PassThrough, - PassThrough, - PassThrough>{}(gemmPtr); - } - - return pass; - }; - - bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && - test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); - - std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; - return pass ? 0 : 1; -} +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp index 3e141d7b3..5d8c4881b 100644 --- a/test/gemm/gemm_fp32.cpp +++ b/test/gemm/gemm_fp32.cpp @@ -24,56 +24,11 @@ #include "test/gemm/gemm_util.hpp" -int main() -{ - using ADataType = float; - using BDataType = float; - using CDataType = float; - using AccDataType = float; +using ADataType = float; +using BDataType = float; +using CDataType = float; +using AccDataType = float; - using Row = ck::tensor_layout::gemm::RowMajor; - using Col = ck::tensor_layout::gemm::ColumnMajor; +#include "run_gemm_test.inc" - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - auto test = [&](auto a_layout, auto b_layout, auto c_layout) { - bool pass = true; - - using DeviceOp = ck::tensor_operation::device::DeviceGemm; - - const auto gemmPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - for(auto& gemmPtr : gemmPtrs) - { - pass &= ck::gemm_util::TestGemm, - ADataType, - BDataType, - CDataType, - AccDataType, - decltype(a_layout), - decltype(b_layout), - decltype(c_layout), - PassThrough, - PassThrough, - PassThrough>{}(gemmPtr); - } - - return pass; - }; - - bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && - test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); - - std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; - return pass ? 
0 : 1; -} +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp index 96dc459a3..85d7f95bf 100644 --- a/test/gemm/gemm_fp64.cpp +++ b/test/gemm/gemm_fp64.cpp @@ -24,56 +24,11 @@ #include "test/gemm/gemm_util.hpp" -int main() -{ - using ADataType = double; - using BDataType = double; - using CDataType = double; - using AccDataType = double; +using ADataType = double; +using BDataType = double; +using CDataType = double; +using AccDataType = double; - using Row = ck::tensor_layout::gemm::RowMajor; - using Col = ck::tensor_layout::gemm::ColumnMajor; +#include "run_gemm_test.inc" - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - auto test = [&](auto a_layout, auto b_layout, auto c_layout) { - bool pass = true; - - using DeviceOp = ck::tensor_operation::device::DeviceGemm; - - const auto gemmPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - for(auto& gemmPtr : gemmPtrs) - { - pass &= ck::gemm_util::TestGemm, - ADataType, - BDataType, - CDataType, - AccDataType, - decltype(a_layout), - decltype(b_layout), - decltype(c_layout), - PassThrough, - PassThrough, - PassThrough>{}(gemmPtr); - } - - return pass; - }; - - bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && - test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); - - std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; - return pass ? 0 : 1; -} +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp index c7d79782a..e73b22ce9 100644 --- a/test/gemm/gemm_int8.cpp +++ b/test/gemm/gemm_int8.cpp @@ -24,56 +24,11 @@ #include "test/gemm/gemm_util.hpp" -int main() -{ - using ADataType = int8_t; - using BDataType = int8_t; - using CDataType = int8_t; - using AccDataType = int32_t; +using ADataType = int8_t; +using BDataType = int8_t; +using CDataType = int8_t; +using AccDataType = int32_t; - using Row = ck::tensor_layout::gemm::RowMajor; - using Col = ck::tensor_layout::gemm::ColumnMajor; +#include "run_gemm_test.inc" - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - auto test = [&](auto a_layout, auto b_layout, auto c_layout) { - bool pass = true; - - using DeviceOp = ck::tensor_operation::device::DeviceGemm; - - const auto gemmPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - for(auto& gemmPtr : gemmPtrs) - { - pass &= ck::gemm_util::TestGemm, - ADataType, - BDataType, - CDataType, - AccDataType, - decltype(a_layout), - decltype(b_layout), - decltype(c_layout), - PassThrough, - PassThrough, - PassThrough>{}(gemmPtr); - } - - return pass; - }; - - bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && - test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); - - std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; - return pass ? 0 : 1; -} +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp new file mode 100644 index 000000000..8f5a5c557 --- /dev/null +++ b/test/gemm/gemm_standalone_xdl_fp16.cpp @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
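+// This standalone test runs a fixed list of GEMM problem sizes against pre-built XDL fp16
+// kernel instances for all four row-/column-major A/B layout combinations; result
+// verification and kernel timing can be toggled from the command line.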
+ +#include "gemm_util.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" + +#include "gemm_f16_nn_instance.hpp" +#include "gemm_f16_nt_instance.hpp" +#include "gemm_f16_tn_instance.hpp" +#include "gemm_f16_tt_instance.hpp" + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using F16 = ck::half_t; +using ADataType = F16; +using BDataType = F16; +using AccDataType = float; +using CDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +using ck::gemm_util::GemmParams; +using ck::tensor_operation::device::BaseOperator; +using ck::tensor_operation::device::DeviceGemm; +using namespace ck::tensor_operation::device::instance;
+ +using DeviceGemmNN = + DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>; +using DeviceGemmNT = + DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>; +using DeviceGemmTN = + DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>; +using DeviceGemmTT = + DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>;
+ +struct LayoutConfig +{ + bool ARowMajor; + bool BRowMajor; + bool CRowMajor; +}; + +int main(int argc, char* argv[]) +{
+ // DeviceGemm is templated on layout and precision types, so its concrete instances cannot be + // stored in a single homogeneous vector. Instead we hold them through the abstract BaseOperator + // class and dynamic_cast() them upon invocation. + // Since DeviceGemm does not expose its template arguments, an extra bookkeeping struct, + // LayoutConfig, records which concrete type a BaseOperator instance should be cast to.
+ using OpFactoryFn = void (*)(std::vector<std::unique_ptr<BaseOperator>>&); + + std::vector<std::tuple<GemmParams, LayoutConfig, OpFactoryFn>> problems = { + // clang-format off + // 104 tiles
+ {GemmParams{2048, 3328, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x64}, + {GemmParams{2048, 3328, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x64}, + {GemmParams{2048, 3328, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x64}, + {GemmParams{2048, 3328, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x64},
+ // 110 tiles
+ {GemmParams{2560, 2816, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x64}, + {GemmParams{2560, 2816, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x64}, + {GemmParams{2560, 2816, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x64}, + {GemmParams{2560, 2816, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x64}, + // clang-format on + };
+ + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + << "arg2: time kernel (0=no, 1=yes)" << std::endl; + return 0; + }
+ + bool pass = true; + for(auto& p : problems) + { + GemmParams& problem_size = std::get<0>(p); + const LayoutConfig& layout_config = std::get<1>(p); + const auto& factory = std::get<2>(p); + std::vector<std::unique_ptr<BaseOperator>> ops; + factory(ops); + + // overwrite strides + problem_size.StrideA = layout_config.ARowMajor ? problem_size.K : problem_size.M; + problem_size.StrideB = layout_config.BRowMajor ? problem_size.N : problem_size.K; + problem_size.StrideC = layout_config.CRowMajor ? problem_size.N : problem_size.M;
+ + if(!layout_config.ARowMajor && !layout_config.BRowMajor) + { + auto op_ptr = dynamic_cast<DeviceGemmNN*>(ops[0].get()); + pass &= ck::gemm_util::TestGemm{}( + op_ptr, problem_size, do_verification, time_kernel); + } + else if(!layout_config.ARowMajor && layout_config.BRowMajor) + { + auto op_ptr = dynamic_cast<DeviceGemmNT*>(ops[0].get()); + pass &= ck::gemm_util::TestGemm{}( + op_ptr, problem_size, do_verification, time_kernel); + } + else if(layout_config.ARowMajor && !layout_config.BRowMajor) + { + auto op_ptr = dynamic_cast<DeviceGemmTN*>(ops[0].get()); + pass &= ck::gemm_util::TestGemm{}( + op_ptr, problem_size, do_verification, time_kernel); + } + else if(layout_config.ARowMajor && layout_config.BRowMajor) + { + auto op_ptr = dynamic_cast<DeviceGemmTT*>(ops[0].get()); + pass &= ck::gemm_util::TestGemm{}( + op_ptr, problem_size, do_verification, time_kernel); + } + }
+ + std::cout << (pass ? "ALL TESTS PASSED" : "SOME TESTS FAILED") << std::endl; + return pass ?
0 : 1; +} diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 2df605be1..6291215b3 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -16,21 +16,13 @@ namespace gemm_util { struct GemmParams { - GemmParams() - : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0) - { - } - - ck::index_t M; - ck::index_t N; - ck::index_t K; + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; - ck::index_t StrideA; - ck::index_t StrideB; - ck::index_t StrideC; - - float alpha; - float beta; + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideC = 1024; }; template & C, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) + CElementwiseOperation c_element_op, + bool time_kernel) { DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize()); @@ -94,7 +87,20 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, { a_m_k_device_buf.ToDevice(A.mData.data()); b_k_n_device_buf.ToDevice(B.mData.data()); - invoker_ptr->Run(argument_ptr.get()); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * params.M * params.N * params.K; + std::size_t num_btype = sizeof(ADataType) * params.M * params.K + + sizeof(BDataType) * params.K * params.N + + sizeof(CDataType) * params.M * params.N; + + float tflops = static_cast<float>(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << std::endl; + c_m_n_device_buf.FromDevice(C.mData.data()); return true; @@ -109,19 +115,15 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, } } -template +template struct TestGemm { + template auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params) { auto f_host_tensor_descriptor = @@ -156,25 +158,42 @@ struct TestGemm f_generate_tensor_value(a_m_k, ADataType{}); f_generate_tensor_value(b_k_n, BDataType{}); + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result); } - auto operator()(const DeviceGemmPtr_& gemmPtr) + template