Commit f4055dc7 authored by Jing Zhang
Browse files

fixed splitK_gemm_fp32

parent 74604d74
...@@ -93,9 +93,9 @@ struct DeviceGemmXdlSplitK : public DeviceGemmSplitK<ALayout, ...@@ -93,9 +93,9 @@ struct DeviceGemmXdlSplitK : public DeviceGemmSplitK<ALayout,
const auto a_grid_desc_m_kpad = transform_tensor_descriptor( const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
a_grid_desc_m_k, a_grid_desc_m_k,
make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(M)), make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<1>{}, Sequence<0>{}));
if constexpr(GemmSpec == GemmSpecialization::MNPadding) if constexpr(GemmSpec == GemmSpecialization::MNPadding)
{ {
......
...@@ -5,49 +5,49 @@ include_directories(BEFORE ...@@ -5,49 +5,49 @@ include_directories(BEFORE
# ck_profiler # ck_profiler
set(PROFILER_SOURCE set(PROFILER_SOURCE
src/profiler.cpp src/profiler.cpp
src/profile_gemm.cpp #src/profile_gemm.cpp
src/profile_gemm_splitk.cpp src/profile_gemm_splitk.cpp
src/profile_gemm_bilinear.cpp #src/profile_gemm_bilinear.cpp
src/profile_gemm_bias_add_reduce.cpp #src/profile_gemm_bias_add_reduce.cpp
src/profile_gemm_add_add_fastgelu.cpp #src/profile_gemm_add_add_fastgelu.cpp
src/profile_gemm_reduce.cpp #src/profile_gemm_reduce.cpp
src/profile_batched_gemm.cpp #src/profile_batched_gemm.cpp
src/profile_batched_gemm_reduce.cpp #src/profile_batched_gemm_reduce.cpp
src/profile_grouped_gemm.cpp #src/profile_grouped_gemm.cpp
src/profile_conv_fwd.cpp #src/profile_conv_fwd.cpp
src/profile_conv_fwd_bias_relu.cpp #src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp #src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_bwd_data.cpp #src/profile_conv_bwd_data.cpp
src/profile_conv_bwd_weight.cpp #src/profile_conv_bwd_weight.cpp
src/profile_grouped_conv_fwd.cpp #src/profile_grouped_conv_fwd.cpp
src/profile_reduce.cpp #src/profile_reduce.cpp
src/profile_layernorm.cpp #src/profile_layernorm.cpp
src/profile_normalization.cpp #src/profile_normalization.cpp
) )
add_executable(ckProfiler ${PROFILER_SOURCE}) add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE utility) target_link_libraries(ckProfiler PRIVATE utility)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance) #target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bilinear_instance) #target_link_libraries(ckProfiler PRIVATE device_gemm_bilinear_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance) #target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance) #target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance) #target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) #target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance) #target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance) #target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance) #target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_fwd_instance) #target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_fwd_instance) #target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance) #target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance) #target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance) #target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance) #target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_weight_instance) #target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance) #target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance) #target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) #target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) #target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_normalization_instance) #target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance) #target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...@@ -46,6 +46,7 @@ static void print_helper_message() ...@@ -46,6 +46,7 @@ static void print_helper_message()
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
#if 0
if(argc == 1) if(argc == 1)
{ {
print_helper_message(); print_helper_message();
...@@ -124,6 +125,11 @@ int main(int argc, char* argv[]) ...@@ -124,6 +125,11 @@ int main(int argc, char* argv[])
{ {
return profile_layernorm(argc, argv); return profile_layernorm(argc, argv);
} }
#endif
if(strcmp(argv[1], "gemm_splitk") == 0)
{
return profile_gemm_splitk(argc, argv);
}
else else
{ {
print_helper_message(); print_helper_message();
......
...@@ -10,7 +10,7 @@ cmake ...@@ -10,7 +10,7 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D BUILD_DEV=OFF \ -D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS=" -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
...@@ -127,10 +127,10 @@ print_log_header $reduction_log $env_type $branch $host_name ...@@ -127,10 +127,10 @@ print_log_header $reduction_log $env_type $branch $host_name
export splitK_gemm_log="perf_splitK_gemm_${gpu_arch}.log" export splitK_gemm_log="perf_splitK_gemm_${gpu_arch}.log"
print_log_header $splitK_gemm_log $env_type $branch $host_name print_log_header $splitK_gemm_log $env_type $branch $host_name
#../script/profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 | tee -a $splitK_gemm_log ../script/profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 | tee -a $splitK_gemm_log
#../script/profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 | tee -a $splitK_gemm_log ../script/profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 | tee -a $splitK_gemm_log
#../script/profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 | tee -a $splitK_gemm_log ../script/profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 | tee -a $splitK_gemm_log
#../script/profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 | tee -a $splitK_gemm_log ../script/profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 | tee -a $splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 | tee -a $splitK_gemm_log ../script/profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 | tee -a $splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 | tee -a $splitK_gemm_log ../script/profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 | tee -a $splitK_gemm_log
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment