Commit 49282565 authored by Jing Zhang's avatar Jing Zhang
Browse files

add grouped gemm instalces

parent 01ee5e53
...@@ -16,7 +16,7 @@ namespace element_wise { ...@@ -16,7 +16,7 @@ namespace element_wise {
extern "C" __device__ float __ocml_native_recip_f32(float); extern "C" __device__ float __ocml_native_recip_f32(float);
#endif #endif
struct PassThrough struct PassThroughPack2
{ {
template <typename Y, typename X> template <typename Y, typename X>
__host__ __device__ void operator()(Y& y, const X& x) const; __host__ __device__ void operator()(Y& y, const X& x) const;
...@@ -64,6 +64,12 @@ struct PassThrough ...@@ -64,6 +64,12 @@ struct PassThrough
} }
constexpr const static bool is_pack2_invocable = true; constexpr const static bool is_pack2_invocable = true;
};
struct PassThrough
{
template <typename Y, typename X>
__host__ __device__ void operator()(Y& y, const X& x) const;
template <> template <>
__host__ __device__ void operator()<double, double>(double& y, const double& x) const __host__ __device__ void operator()<double, double>(double& y, const double& x) const
......
...@@ -4,97 +4,99 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp ...@@ -4,97 +4,99 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp) device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp)
list(APPEND GEMM_INSTANCES list(APPEND GEMM_INSTANCES
#device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
#device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
#device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
#device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
#device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
#device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
#device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
#device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
) )
list(APPEND GEMM_INSTANCES list(APPEND GEMM_INSTANCES
#device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
#device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
#device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
#device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
#device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
#device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
#device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
#device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
#device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp
#device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp
#device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp
#device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp
#device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp
#device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp
#device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
#device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp
#device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
) )
list(APPEND GEMM_INSTANCES list(APPEND GEMM_INSTANCES
#device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
#device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
#device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
#device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
#device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
#device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
#device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
#device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp) device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp list(APPEND GEMM_INSTANCES
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp) device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp list(APPEND GEMM_INSTANCES
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp) device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
...@@ -108,21 +110,21 @@ if (ENABLE_PIPELINE_V2_OPT) ...@@ -108,21 +110,21 @@ if (ENABLE_PIPELINE_V2_OPT)
CK_USE_WAVES_PER_EU=1 CK_USE_WAVES_PER_EU=1
CK_MIN_WAVES_PER_EU=1 CK_MIN_WAVES_PER_EU=1
CK_MAX_WAVES_PER_EU=1 CK_MAX_WAVES_PER_EU=1
) )
set(IGLP_OPT_DEFS set(IGLP_OPT_DEFS
CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1 CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
) )
# TODO: The "-vectorize-slp=false" LLVM option is a workaround to prevent inefficient instruction scheduling # TODO: The "-vectorize-slp=false" LLVM option is a workaround to prevent inefficient instruction scheduling
# caused by the SLP Vectorizer. Remove this option after fix the SLP Vectorizer issue. # caused by the SLP Vectorizer. Remove this option after fix the SLP Vectorizer issue.
# layout=NT # layout=NT
set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
COMPILE_OPTIONS ";-mllvm;-vectorize-slp=false" COMPILE_OPTIONS ";-mllvm;-vectorize-slp=false"
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}") COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
# layout=NN # layout=NN
set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
COMPILE_OPTIONS ";-mllvm;-vectorize-slp=false" COMPILE_OPTIONS ";-mllvm;-vectorize-slp=false"
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}") COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
# layout=TT # layout=TT
set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
COMPILE_OPTIONS ";;" COMPILE_OPTIONS ";;"
......
...@@ -7,6 +7,6 @@ add_instance_library(device_grouped_gemm_instance ...@@ -7,6 +7,6 @@ add_instance_library(device_grouped_gemm_instance
device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instance.cpp
) )
...@@ -3,51 +3,53 @@ set(PROFILER_SOURCES ...@@ -3,51 +3,53 @@ set(PROFILER_SOURCES
profiler.cpp profiler.cpp
profile_gemm.cpp profile_gemm.cpp
profile_gemm_splitk.cpp profile_gemm_splitk.cpp
#profile_gemm_bias_add_reduce.cpp profile_gemm_bias_add_reduce.cpp
#profile_gemm_add_multiply.cpp profile_gemm_add_multiply.cpp
#profile_gemm_multiply_add.cpp profile_gemm_multiply_add.cpp
#profile_gemm_reduce.cpp profile_gemm_reduce.cpp
#profile_batched_gemm.cpp profile_batched_gemm.cpp
#profile_batched_gemm_reduce.cpp profile_batched_gemm_reduce.cpp
#profile_conv_fwd.cpp profile_conv_fwd.cpp
#profile_conv_fwd_bias_relu.cpp profile_conv_fwd_bias_relu.cpp
#profile_conv_fwd_bias_relu_add.cpp profile_conv_fwd_bias_relu_add.cpp
#profile_conv_bwd_data.cpp profile_conv_bwd_data.cpp
#profile_grouped_conv_fwd.cpp profile_grouped_conv_fwd.cpp
#profile_grouped_conv_bwd_weight.cpp profile_grouped_conv_bwd_weight.cpp
#profile_reduce.cpp profile_reduce.cpp
#profile_groupnorm.cpp profile_groupnorm.cpp
#profile_layernorm.cpp profile_layernorm.cpp
#profile_max_pool3d_fwd.cpp profile_max_pool3d_fwd.cpp
#profile_avg_pool3d_bwd.cpp profile_avg_pool3d_bwd.cpp
#profile_max_pool3d_bwd.cpp profile_max_pool3d_bwd.cpp
#profile_softmax.cpp profile_softmax.cpp
#profile_batchnorm_fwd.cpp profile_batchnorm_fwd.cpp
#profile_batchnorm_bwd.cpp profile_batchnorm_bwd.cpp
#profile_batchnorm_infer.cpp profile_batchnorm_infer.cpp
#profile_grouped_conv_bwd_data.cpp profile_grouped_conv_bwd_data.cpp
#profile_conv_tensor_rearrange.cpp profile_conv_tensor_rearrange.cpp
) )
#if(DL_KERNELS)
#list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) if(DL_KERNELS)
#endif() list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
#if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) endif()
#list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
#list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
#list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
#list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp) list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
#list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
#list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
#list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
#list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
#list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
#endif() endif()
#if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
#list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp) list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
#list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp) list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
#endif() endif()
set(PROFILER_EXECUTABLE ckProfiler) set(PROFILER_EXECUTABLE ckProfiler)
...@@ -57,57 +59,58 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors) ...@@ -57,57 +59,58 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
#if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
#endif() endif()
#if(DL_KERNELS) if(DL_KERNELS)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
#endif() endif()
#if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
#endif() endif()
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
...@@ -27,7 +27,8 @@ enum struct GemmDataType ...@@ -27,7 +27,8 @@ enum struct GemmDataType
F16_F16_F16, // 1 F16_F16_F16, // 1
BF16_BF16_BF16, // 2 BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3 INT8_INT8_INT8, // 3
F16_F8_F16, // 4 F8_F16_F16, // 4
F16_F8_F16, // 5
}; };
#define OP_NAME "grouped_gemm" #define OP_NAME "grouped_gemm"
...@@ -170,6 +171,26 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -170,6 +171,26 @@ int profile_grouped_gemm(int argc, char* argv[])
StrideCs, StrideCs,
kbatch); kbatch);
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::f8_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch);
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment