Merge remote-tracking branch 'origin/develop' into cpu_avx2

b79df771 · carlushuang · 05d38218 · 63914743 · b79df771 · b79df771
Commit b79df771 authored Jul 12, 2022 by carlushuang
20 changed files
--- a/example/19_binary_elementwise/elementwise_add_4d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_4d.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <iostream>
 #include <cstdlib>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"

-#include "device_tensor.hpp"
-#include "binary_element_wise_operation.hpp"
-#include "device_binary_elementwise.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"

 using F16 = ck::half_t;
 using F32 = float;
@@ -42,8 +20,7 @@ using ABDataType             = F16;
 using CDataType              = F16;
 using EltwiseComputeDataType = F32;

-using Add = ck::tensor_operation::binary_element_wise::
-    Add<EltwiseComputeDataType, EltwiseComputeDataType, EltwiseComputeDataType>;
+using Add = ck::tensor_operation::element_wise::Add;

 using DeviceElementwiseAddInstance =
    ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
@@ -104,15 +81,21 @@ int main()
    a_device_buf.ToDevice(a.mData.data());
    b_device_buf.ToDevice(b.mData.data());

+    std::array<const void*, 2> input = {a_device_buf.GetDeviceBuffer(),
+                                        b_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {c_device_buf.GetDeviceBuffer()};
+
+    std::vector<ck::index_t> a_strides{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> b_strides{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> c_strides{c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end()};
+
    auto broadcastAdd = DeviceElementwiseAddInstance{};
-    auto argument     = broadcastAdd.MakeArgumentPointer(
-        a_device_buf.GetDeviceBuffer(),
-        b_device_buf.GetDeviceBuffer(),
-        c_device_buf.GetDeviceBuffer(),
+    auto argument =
+        broadcastAdd.MakeArgumentPointer(input,
+                                         output,
                                         std::vector<ck::index_t>{nchw.begin(), nchw.end()},
-        std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
-        std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
-        std::vector<ck::index_t>{c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end()},
+                                         {{a_strides}, b_strides},
+                                         {c_strides},
                                         Add{});

    if(!broadcastAdd.IsSupportedArgument(argument.get()))

--- a/example/20_convnd_bwd_weight_xdl/CMakeLists.txt
+++ b/example/20_convnd_bwd_weight_xdl/CMakeLists.txt
 add_example_executable(example_convnd_bwd_weight_xdl convnd_bwd_weight_xdl.cpp)
+add_example_executable(example_convnd_bwd_weight_xdl_bf16_splitk convnd_bwd_weight_xdl_bf16_splitk.cpp)
 target_link_libraries(example_convnd_bwd_weight_xdl PRIVATE conv_util)
+target_link_libraries(example_convnd_bwd_weight_xdl_bf16_splitk PRIVATE conv_util)
\ No newline at end of file
--- a/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp
+++ b/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-
-#include "check_err.hpp"
-#include "conv_util.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "element_wise_operation.hpp"
-#include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
-#include "reference_conv_backward_weight.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"

 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -297,43 +297,7 @@ int main(int argc, char* argv[])
                                  split_k);

    // alloc work space
-    size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
    float ave_time = 0.f;
-    if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
-    {
-        DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
-        wei_work_space_device_buf.SetZero();
-        argument = conv->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<AccDataType*>(wei_work_space_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            params.N_,
-            params.K_,
-            params.C_,
-            params.input_spatial_lengths_,
-            params.filter_spatial_lengths_,
-            output_spatial_lengths,
-            params.conv_filter_strides_,
-            params.conv_filter_dilations_,
-            params.input_left_pads_,
-            params.input_right_pads_,
-            InElementOp{},
-            WeiElementOp{},
-            OutElementOp{},
-            split_k);
-
-        if(!conv->IsSupportedArgument(argument.get()))
-        {
-            std::cout << "wrong! device_conv with the specified compilation parameters does "
-                         "not support this Conv problem"
-                      << std::endl;
-            return 1;
-        }
-
-        ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
-    }
-    else
-    {
    if(!conv->IsSupportedArgument(argument.get()))
    {
        std::cout << "wrong! device_conv with the specified compilation parameters does "
@@ -342,7 +306,6 @@ int main(int argc, char* argv[])
        return 1;
    }
    ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
-    }

    std::size_t flop = ck::utils::conv::get_flops(
        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);

--- a/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
+++ b/example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
--- a/example/21_gemm_layernorm/CMakeLists.txt
+++ b/example/21_gemm_layernorm/CMakeLists.txt
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_fp16 gemm_bias_relu_add_layernorm_xdl_fp16.cpp)
 add_example_executable(example_gemm_layernorm_xdl_fp16 gemm_layernorm_xdl_fp16.cpp)
+add_example_executable(example_gemm_xdl_layernorm_single_kernel_fp16 gemm_xdl_layernorm_single_kernel_fp16.cpp)
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-
-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "device_cgemm_4gemm_xdl_cshuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "reference_cgemm.hpp"
-#include "gemm_specialization.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/23_softmax/CMakeLists.txt
+++ b/example/23_softmax/CMakeLists.txt
+add_example_executable(example_softmax_blockwise softmax_blockwise.cpp)
\ No newline at end of file
--- a/example/23_softmax/README.md
+++ b/example/23_softmax/README.md
--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
--- a/example/24_batched_gemm_c_permute/CMakeLists.txt
+++ b/example/24_batched_gemm_c_permute/CMakeLists.txt
--- a/example/24_batched_gemm_c_permute/batched_gemm_c_permute_xdl_fp16.cpp
+++ b/example/24_batched_gemm_c_permute/batched_gemm_c_permute_xdl_fp16.cpp
--- a/example/25_gemm_bias_c_permute/CMakeLists.txt
+++ b/example/25_gemm_bias_c_permute/CMakeLists.txt
--- a/example/25_gemm_bias_c_permute/gemm_bias_c_permute_xdl_fp16.cpp
+++ b/example/25_gemm_bias_c_permute/gemm_bias_c_permute_xdl_fp16.cpp
--- a/example/26_contraction/CMakeLists.txt
+++ b/example/26_contraction/CMakeLists.txt
--- a/example/26_contraction/README.md
+++ b/example/26_contraction/README.md
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp