merge develop

1dbdab56 · Jing Zhang · d2e49b23 · bac7df8f · 1dbdab56 · 1dbdab56
Commit 1dbdab56 authored Aug 18, 2022 by Jing Zhang
20 changed files
--- a/example/34_batchnorm/CMakeLists.txt
+++ b/example/34_batchnorm/CMakeLists.txt
+add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp)
+add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp)
--- a/example/34_batchnorm/README.md
+++ b/example/34_batchnorm/README.md
--- a/example/34_batchnorm/batchnorm_common.hpp
+++ b/example/34_batchnorm/batchnorm_common.hpp
--- a/example/34_batchnorm/batchnorm_forward_impl.hpp
+++ b/example/34_batchnorm/batchnorm_forward_impl.hpp
--- a/example/34_batchnorm/batchnorm_forward_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_nhwc.cpp
--- a/example/34_batchnorm/batchnorm_infer_impl.hpp
+++ b/example/34_batchnorm/batchnorm_infer_impl.hpp
--- a/example/34_batchnorm/batchnorm_infer_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_infer_nhwc.cpp
--- a/example/31_splitK_gemm/CMakeLists.txt
+++ b/example/31_splitK_gemm/CMakeLists.txt
--- a/example/31_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
+++ b/example/31_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
--- a/example/31_splitK_gemm/splitK_gemm_xdl_fp16.cpp
+++ b/example/31_splitK_gemm/splitK_gemm_xdl_fp16.cpp
--- a/example/31_splitK_gemm/splitK_gemm_xdl_fp32.cpp
+++ b/example/31_splitK_gemm/splitK_gemm_xdl_fp32.cpp
--- a/example/31_splitK_gemm/splitK_gemm_xdl_int8.cpp
+++ b/example/31_splitK_gemm/splitK_gemm_xdl_int8.cpp
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -30,7 +30,7 @@ add_subdirectory(12_reduce)
 add_subdirectory(13_pool2d_fwd)
 add_subdirectory(14_gemm_xdl_requant_relu_requant)
 add_subdirectory(15_grouped_gemm)
-add_subdirectory(16_gemm_reduce)
+add_subdirectory(16_gemm_multi_d_multi_reduces)
 add_subdirectory(17_convnd_bwd_data)
 add_subdirectory(18_batched_gemm_reduce)
 add_subdirectory(19_binary_elementwise)
@@ -42,6 +42,11 @@ add_subdirectory(24_batched_gemm)
 add_subdirectory(25_gemm_bias_e_permute)
 add_subdirectory(26_contraction)
 add_subdirectory(27_layernorm)
-add_subdirectory(28_grouped_gemm_bias)
+add_subdirectory(28_grouped_gemm_bias_e_permute)
-add_subdirectory(30_grouped_convnd_fwd_bias_relu)
+add_subdirectory(29_batched_gemm_bias_e_permute)
-add_subdirectory(31_splitK_gemm)
+add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
+add_subdirectory(31_batched_gemm_gemm)
+add_subdirectory(32_batched_gemm_scale_softmax_gemm)
+add_subdirectory(33_multiple_reduce)
+add_subdirectory(34_batchnorm)
+add_subdirectory(35_splitK_gemm)
--- a/include/ck/tensor_description/tensor_descriptor.hpp
+++ b/include/ck/tensor_description/tensor_descriptor.hpp
@@ -4,6 +4,7 @@
 #pragma once
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/sequence_helper.hpp"
 #include "ck/tensor_description/multi_index_transform.hpp"
 namespace ck {
@@ -159,6 +160,12 @@ struct TensorDescriptor
        return transforms_[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
    }
+    __host__ __device__ constexpr auto GetLengths() const
+    {
+        // FIXME: use Tuple of reference instead
+        return generate_sequence_v2([&](auto I) { return GetLength(I); }, Number<ndim_visible_>{});
+    }
    __host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
    __host__ __device__ constexpr auto GetElementSpaceSize() const { return element_space_size_; }

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
--- a/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp