Merge remote-tracking branch 'origin/develop' into cpu_avx2

6dfb4e78 · carlushuang · 397a68f2 · 1ced00a5 · 6dfb4e78 · 6dfb4e78
Commit 6dfb4e78 authored Jun 12, 2022 by carlushuang
20 changed files
--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
+add_example_executable(example_gemm_layernorm_xdl_fp16 gemm_layernorm_xdl_fp16.cpp)
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
--- a/example/22_cgemm/CMakeLists.txt
+++ b/example/22_cgemm/CMakeLists.txt
+add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp)
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
 include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/include/ck
    ${PROJECT_SOURCE_DIR}/include/ck/utility
+    ${PROJECT_SOURCE_DIR}/include/ck/host_utility
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
    ${PROJECT_SOURCE_DIR}/include/ck/tensor
    ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
@@ -38,7 +39,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
    target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
    add_dependencies(examples ${EXAMPLE_NAME})
-endfunction(add_example_executable EXAMPLE_NAME)
+endfunction(add_example_executable_no_testing EXAMPLE_NAME)
 add_subdirectory(01_gemm)
 add_subdirectory(02_gemm_alpha_beta)
@@ -52,10 +53,13 @@ add_subdirectory(11_conv2d_bwd_weight)
 add_subdirectory(12_reduce)
 add_subdirectory(13_pool2d_fwd)
 add_subdirectory(14_gemm_xdl_requant_relu_requant)
-add_subdirectory(17_convnd_bwd_data_xdl)
 add_subdirectory(15_grouped_gemm)
 add_subdirectory(16_gemm_reduce)
+add_subdirectory(17_convnd_bwd_data_xdl)
 add_subdirectory(18_batched_gemm_reduce)
+add_subdirectory(19_binary_elementwise)
+add_subdirectory(20_convnd_bwd_weight_xdl)
+add_subdirectory(21_gemm_layernorm)
+add_subdirectory(22_cgemm)
 add_subdirectory(cpu_01_conv2d_fwd)
 add_subdirectory(cpu_02_conv2d_fwd_bias_relu_add)
--- a/include/ck/config.hpp
+++ b/include/ck/config.hpp
@@ -87,6 +87,12 @@
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif
+#if defined(__gfx90a__) // for GPU code
+#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
+#else
+#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
+#endif
 // inline asm
 #define CK_USE_AMD_INLINE_ASM 1
@@ -102,10 +108,11 @@
 // experimental feature: static tensor descriptor
 #define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
-// experimental feature: buffer load/store/atomic-add OOB trick
+// experimental feature: buffer load/store/atomic-add/ OOB trick
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
 #define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
 #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
+#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
 // experimental feature: in-regsiter sub-dword transpose
 #define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
@@ -157,9 +164,23 @@ enum struct InMemoryDataOperationEnum
 {
    Set,
    AtomicAdd,
+    AtomicMax,
    Add
 };
+template <InMemoryDataOperationEnum... Is>
+struct InMemoryDataOperationEnumSequence
+{
+    static constexpr int mSize = sizeof...(Is);
+    __host__ __device__ static constexpr InMemoryDataOperationEnum At(int I)
+    {
+        // the last dummy element is to prevent compiler complain about empty array, when mSize = 0
+        const InMemoryDataOperationEnum mData[mSize + 1] = {Is..., InMemoryDataOperationEnum::Set};
+        return mData[I];
+    }
+};
 // TODO: no longer needed, remove this
 enum struct ActivTypeEnum
 {

--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
+#pragma once
+#include <string>
+#include <map>
+namespace ck {
+inline std::string get_device_name()
+{
+    hipDeviceProp_t props{};
+    int device;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess)
+    {
+        return std::string();
+    }
+    status = hipGetDeviceProperties(&props, device);
+    if(status != hipSuccess)
+    {
+        return std::string();
+    }
+    const std::string raw_name(props.gcnArchName);
+    // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
+    static std::map<std::string, std::string> device_name_map = {
+        {"Ellesmere", "gfx803"},
+        {"Baffin", "gfx803"},
+        {"RacerX", "gfx803"},
+        {"Polaris10", "gfx803"},
+        {"Polaris11", "gfx803"},
+        {"Tonga", "gfx803"},
+        {"Fiji", "gfx803"},
+        {"gfx800", "gfx803"},
+        {"gfx802", "gfx803"},
+        {"gfx804", "gfx803"},
+        {"Vega10", "gfx900"},
+        {"gfx901", "gfx900"},
+        {"10.3.0 Sienna_Cichlid 18", "gfx1030"},
+    };
+    const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
+    auto match = device_name_map.find(name);
+    if(match != device_name_map.end())
+        return match->second;
+    return name;
+}
+} // namespace ck
--- a/include/ck/options.hpp
+++ b/include/ck/options.hpp
+#pragma once
+#define CK_TIME_KERNEL 1
--- a/include/ck/options.hpp.in
+++ b/include/ck/options.hpp.in
-#pragma once
-#cmakedefine01 CK_TIME_KERNEL
-#cmakedefine CK_NOGPU
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp
-#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
+#pragma once
-#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
 #include "common_header.hpp"
 #include "tensor_adaptor.hpp"
-#include "threadwise_tensor_slice_transfer_v2.hpp"
+#include "threadwise_tensor_slice_transfer_v4r1.hpp"
-#include "threadwise_contraction_dlops.hpp"
+#include "threadwise_contraction_dl.hpp"
 namespace ck {
@@ -41,7 +39,7 @@ template <index_t BlockSize,
          typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
                                 BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
                             bool>::type = false>
-struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
+struct BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
 {
    using AIndex = MultiIndex<3>;
    using BIndex = MultiIndex<3>;
@@ -148,7 +146,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
        MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});
    public:
-    __device__ BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
+    __device__ BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
        : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
              get_thread_local_1d_id())},
          a_thread_copy_{
@@ -175,6 +173,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
                      "wrong!");
        // TODO: remove this restriction
+        static_assert(BM0 == 2, "wrong");
        static_assert(BM0 == 2 && BN0 == 2, "wrong");
    }
@@ -226,7 +225,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
            b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());
        constexpr auto threadwise_contraction =
-            ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
+            ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
                FloatA,
                FloatB,
                FloatC,
@@ -407,4 +406,3 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
 };
 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
@@ -75,14 +75,13 @@ struct BlockwiseTensorSliceTransfer_v5r1
        }
    }
-    template <typename SrcBuffer, typename SrcStepHacks>
+    template <typename SrcBuffer>
-    __device__ void
+    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
-    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
-            threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks);
+            threadwise_transfer_.RunRead(src_desc, src_buf);
        }
    }

--- a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+#pragma once
+namespace ck {
+namespace tensor_operation {
+namespace device {
+enum struct ConvolutionBackwardWeightSpecialization
+{
+    Default,
+    Filter1x1Stride1Pad0,
+    Filter1x1Pad0,
+    OddC,
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -40,6 +40,10 @@ struct BaseOperator
    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
    virtual std::string GetTypeString() const { return ""; }
+    virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }
+    virtual void SetWorkSpacePointer(BaseArgument*, void*) const {}
    virtual ~BaseOperator() {}
 };

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
--- a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
--- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp