merge develop

1dbdab56 · Jing Zhang · d2e49b23 · bac7df8f · 1dbdab56 · 1dbdab56
Commit 1dbdab56 authored Aug 18, 2022 by Jing Zhang
20 changed files
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
--- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp)
--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
--- a/example/30_grouped_convnd_fwd_bias_relu/CMakeLists.txt
+++ b/example/30_grouped_convnd_fwd_bias_relu/CMakeLists.txt
-add_example_executable(example_grouped_convnd_fwd_bias_relu_xdl_fp16 grouped_convnd_fwd_bias_relu_xdl_fp16.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_xdl_fp16 PRIVATE utility)
--- a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
+add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
+target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
+
+add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
+target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 PRIVATE utility)
+
+add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
+target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 PRIVATE utility)
+
+add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
+target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 PRIVATE utility)
\ No newline at end of file
--- a/example/30_grouped_convnd_fwd_bias_relu/README.md
+++ b/example/30_grouped_convnd_fwd_bias_relu/README.md
@@ -12,17 +12,23 @@
 # <left padding>, (ie LeftPy, LeftPx for 2D)
 # <right padding>, (ie RightPy, RightPx for 2D)

-bin/example_grouped_convnd_fwd_bias_relu_xdl_fp16 1 1 1
+bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```

 Result (MI100)
 ```
-in: dim 5, lengths {1, 128, 192, 71, 71}, strides {6912, 967872, 1, 13632, 192}
-wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {192, 1728, 1, 576, 192}
-bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
-out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384}
+wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
+bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512}
+A[M, K]: {165888, 1728}
+B[N, K]: {256, 1728}
+Ds[M, N]: {165888, 256}
+Ds[M, N]: {165888, 256}
+E[M, N]: {165888, 256}
+launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
-Perf: 1.19215 ms, 123.112 TFlops, 279.827 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
+Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
 ```
\ No newline at end of file
--- a/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_common.hpp
+++ b/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_common.hpp
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
--- a/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_relu_xdl_fp16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_relu_xdl_fp16.cpp
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
--- a/example/31_batched_gemm_gemm/CMakeLists.txt
+++ b/example/31_batched_gemm_gemm/CMakeLists.txt
+add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp)
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
--- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp
--- a/example/33_multiple_reduce/CMakeLists.txt
+++ b/example/33_multiple_reduce/CMakeLists.txt
+add_example_executable(example_dual_reduce_multiblock dual_reduce_multiblock.cpp)
+add_example_executable(example_dual_reduce_threadwise dual_reduce_threadwise.cpp)
--- a/example/33_multiple_reduce/README.md
+++ b/example/33_multiple_reduce/README.md
+# Instructions for ```example_dual_reduce```
+
+## Run ```example_dual_reduce_multiblock```
+```bash
+# -D <xxx> : input 4-d tensor lengths
+# -v <x> :   verification (0=no, 1=yes)
+#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg2: time kernel (0=no, 1=yes) 
+./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1
+```
+
+Result
+```
+./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1                        
+launch_and_time_kernel: grid_dim {150, 1, 1}, block_dim {256, 1, 1} 
+Warm up 1 time
+Start running 10 times...
+Perf: 1.19529 ms, 201.499 GB/s, DeviceMultipleReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_1_InSrcVectorSize_1,OutDstVectorSize_1_1>
+```
+
+## Run ```example_dual_reduce_threadwise```
+```bash
+# -D <xxx> : input 4-d tensor lengths
+# -v <x> :   verification (0=no, 1=yes)
+#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg2: time kernel (0=no, 1=yes)
+./bin/example_dual_reduce_multiblock -D 8000,4,4,4 -v 1 2 1
+```
+
+Result
+```
+./bin/example_dual_reduce_threadwise -D 8000,4,4,4 -v 1 2 1
+launch_and_time_kernel: grid_dim {32, 1, 1}, block_dim {256, 1, 1} 
+Warm up 1 time
+Start running 10 times...
+Perf: 0.01512 ms, 71.9577 GB/s, DeviceMultipleReduceThreadwise<256,M_C256_S1,K_C1_S4,InSrcVectorDim_1_InSrcVectorSize_2,OutDstVectorSize_1_1>
+```
--- a/example/33_multiple_reduce/dual_reduce_common.hpp
+++ b/example/33_multiple_reduce/dual_reduce_common.hpp
--- a/example/33_multiple_reduce/dual_reduce_multiblock.cpp
+++ b/example/33_multiple_reduce/dual_reduce_multiblock.cpp
--- a/example/33_multiple_reduce/dual_reduce_threadwise.cpp
+++ b/example/33_multiple_reduce/dual_reduce_threadwise.cpp