gaoqiong / composable_kernel · Commit 5736b460

Commit 5736b460, authored Feb 08, 2023 by fsx950223
Merge branch 'my-attn-bwd2' into my-attn-bwd3
Parents: aace9ec6, f9bb62d5

Changes: 28 · Showing 8 changed files with 337 additions and 0 deletions (+337, -0)
Changed files:
  include/ck/utility/thread_group.hpp                                                                         +33  -0
  library/include/ck/library/reference_tensor_operation/cpu/reference_dropout.hpp                             +102 -0
  library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp  +19  -0
  test/CMakeLists.txt                                                                                         +1   -0
  test/host_tensor/CMakeLists.txt                                                                             +2   -0
  test/host_tensor/test_host_tensor.cpp                                                                       +106 -0
  test/softmax/CMakeLists.txt                                                                                 +3   -0
  test/softmax/test_softmax_host_ref.cpp                                                                      +71  -0
include/ck/utility/thread_group.hpp (view file @ 5736b460)

...
@@ -19,4 +19,37 @@ struct ThisThreadBlock

    __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }
};

template <index_t ThreadPerBlock>
struct SubThreadBlock
{
    static constexpr index_t kNumThread_ = ThreadPerBlock;

    __device__ SubThreadBlock(int mwave, int nwave) : mwave_(mwave), nwave_(nwave) {}

    __device__ static constexpr index_t GetNumOfThread() { return kNumThread_; }

    template <typename TupleArg1, typename TupleArg2>
    __device__ constexpr bool IsBelong(const TupleArg1& mwave_range, const TupleArg2& nwave_range)
    {
        // wave_range[I0] inclusive, wave_range[I1] exclusive
        if(mwave_ < mwave_range[I0])
            return false;
        else if(mwave_ >= mwave_range[I1])
            return false;
        else if(nwave_ < nwave_range[I0])
            return false;
        else if(nwave_ >= nwave_range[I1])
            return false;
        else
            return true;
    }

    __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }

    private:
    index_t mwave_, nwave_;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
};

} // namespace ck
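In the new SubThreadBlock, IsBelong treats each wave range as half-open: wave_range[I0] is inclusive and wave_range[I1] is exclusive. A minimal host-side sketch of the same predicate, using plain std::pair ranges instead of ck tuples (the helper names and example values are illustrative assumptions, not code from this commit):

#include <utility>

// Illustrative host-side analogue of SubThreadBlock::IsBelong; the real class
// indexes ck tuples with Number<0>/Number<1> and runs on the device.
struct WaveCoord
{
    int mwave_;
    int nwave_;

    // True iff (mwave_, nwave_) lies in [m.first, m.second) x [n.first, n.second).
    bool IsBelong(std::pair<int, int> m, std::pair<int, int> n) const
    {
        return mwave_ >= m.first && mwave_ < m.second && nwave_ >= n.first && nwave_ < n.second;
    }
};

// Example: the wave at (1, 2) belongs to the tile covering wave rows [0, 2) and columns [2, 4):
//   WaveCoord{1, 2}.IsBelong({0, 2}, {2, 4}) == true
//   WaveCoord{2, 2}.IsBelong({0, 2}, {2, 4}) == false   // row bound is exclusive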
library/include/ck/library/reference_tensor_operation/cpu/reference_dropout.hpp (new file, mode 100644, view file @ 5736b460)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>
#include <vector>
#include <algorithm>

#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

template <typename RefDataType, typename InDataType, typename OutDataType>
struct ReferenceDropout : public device::BaseOperator
{
    // Argument
    struct Argument : public device::BaseArgument
    {
        Argument(const Tensor<RefDataType>& ref,
                 const Tensor<InDataType>& in,
                 Tensor<OutDataType>& out,
                 RefDataType p_dropout_in_16bits,
                 float rp_dropout)
            : ref_(ref),
              in_(in),
              out_(out),
              p_dropout_in_16bits_(p_dropout_in_16bits),
              rp_dropout_(ck::type_convert<OutDataType>(rp_dropout))
        {
        }

        const Tensor<RefDataType>& ref_;
        const Tensor<InDataType>& in_;
        Tensor<OutDataType>& out_;
        RefDataType p_dropout_in_16bits_;
        OutDataType rp_dropout_;
    };

    // Invoker
    struct Invoker : public device::BaseInvoker
    {
        float Run(const Argument& arg)
        {
            arg.out_.ForEach([&](auto& self, auto idx) {
                self(idx) =
                    arg.ref_(idx) < arg.p_dropout_in_16bits_ ? arg.in_(idx) * arg.rp_dropout_ : 0;
            });
            return 0;
        }

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        }
    };

    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
        return true;
    }

    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }

    static auto MakeArgument(const Tensor<RefDataType>& ref,
                             const Tensor<InDataType>& in,
                             Tensor<OutDataType>& out,
                             RefDataType p_dropout_in_16bits,
                             float rp_dropout)
    {
        return Argument{ref, in, out, p_dropout_in_16bits, rp_dropout};
    }

    static auto MakeInvoker() { return Invoker{}; }

    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();
        // clang-format off
        str << "ReferenceDropout" << std::endl;
        // clang-format on
        return str.str();
    }
};

} // namespace host
} // namespace tensor_operation
} // namespace ck
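ReferenceDropout keeps an element when its ref value is below the 16-bit threshold, scales it by rp_dropout, and writes zero otherwise. A host-side usage sketch follows; the 4x8 shape, the uint16_t ref type, and the 0.8 keep ratio are illustrative assumptions rather than values taken from this commit:

#include <cstdint>

#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_dropout.hpp"

void run_reference_dropout_example()
{
    // Hypothetical 4x8 problem; in real use, ref would hold per-element random 16-bit values
    // and in would hold the activations to be dropped out.
    Tensor<uint16_t> ref({4, 8});
    Tensor<float> in({4, 8});
    Tensor<float> out({4, 8});

    // Elements whose ref value is below the threshold are kept and rescaled; the rest become 0.
    const uint16_t p_dropout_in_16bits = static_cast<uint16_t>(0.8 * 65535.0); // assumed keep threshold
    const float rp_dropout             = 1.f / 0.8f;                           // assumed rescale factor

    using RefDropout = ck::tensor_operation::host::ReferenceDropout<uint16_t, float, float>;

    auto argument = RefDropout::MakeArgument(ref, in, out, p_dropout_in_16bits, rp_dropout);
    auto invoker  = RefDropout::MakeInvoker();
    invoker.Run(argument);
}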
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp (view file @ 5736b460)

...
@@ -11,6 +11,25 @@

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

/*
For an fp16 M-contiguous matrix of size M_K, each thread reads a 4x2 tile (2 * 64 bits) from global
memory, transposes the 4x2 tile inside registers, and writes it into LDS in K0_M_K1 layout. This
allows us to use the 128-bit LDS write instruction. It also avoids write bank conflicts, because two
vertically connected 4x2 tiles form a contiguous chunk of memory when modeled as a K0_M_K1 layout
with K1=2.

         <- K1 ->             <- K1 ->            <- K1 ->
         _________            _________           _________
  |      | 0 | 4 |  transpose | 0 - 1 |   to LDS  | 0 - 1 |
  |      | 1 | 5 |    --->    | 2 - 3 |   ---->   | 2 - 3 |
  |      | 2 | 6 |            | 4 - 5 |           | 4 - 5 |
  M      | 3 | 7 |            | 6 - 7 |           | 6 - 7 |
  |      ---------            ---------           ---------
  |      |  ...  |            |  ...  |           |  ...  |
  v      ---------            ---------           ---------
           VMEM                 VGPR                 LDS
*/

namespace ck {
namespace tensor_operation {
namespace device {

...
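As a worked illustration of the K0_M_K1 layout described in the comment above, the sketch below (a hypothetical helper, not code from this commit) linearizes a [K0][M][K1] buffer row-major and checks that, with K1 = 2, a 4(M) x 2(K) tile occupies 8 consecutive fp16 slots, i.e. exactly one 128-bit LDS write:

#include <cassert>
#include <cstddef>

// Illustrative only: row-major linearization of a [K0][M][K1] buffer, the K0_M_K1 layout
// referred to above, with K1 fixed at 2 as in the comment.
constexpr std::size_t K1 = 2;

constexpr std::size_t k0_m_k1_offset(std::size_t k, std::size_t m, std::size_t M)
{
    const std::size_t k0 = k / K1; // coarse K index
    const std::size_t k1 = k % K1; // position within a K1 pair
    return k0 * M * K1 + m * K1 + k1;
}

int main()
{
    constexpr std::size_t M = 128; // assumed M extent, for illustration

    // The 4(M) x 2(K) tile starting at (m = 0, k = 0) maps to 8 consecutive offsets 0..7,
    // so one thread can store its transposed tile with a single 128-bit LDS write.
    const std::size_t base = k0_m_k1_offset(0, 0, M);
    for(std::size_t m = 0; m < 4; ++m)
        for(std::size_t k = 0; k < 2; ++k)
            assert(k0_m_k1_offset(k, m, M) == base + m * K1 + k);

    return 0;
}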
test/CMakeLists.txt (view file @ 5736b460)

...
@@ -58,3 +58,4 @@ add_subdirectory(batchnorm)

if(GPU_TARGETS MATCHES "gfx1100")
    add_subdirectory(wmma_op)
endif()
add_subdirectory(host_tensor)
test/host_tensor/CMakeLists.txt (new file, mode 100644, view file @ 5736b460)

add_gtest_executable(test_host_tensor test_host_tensor.cpp)
target_link_libraries(test_host_tensor PRIVATE utility)
\ No newline at end of file
test/host_tensor/test_host_tensor.cpp (new file, mode 100644, view file @ 5736b460)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <vector>

#include <gtest/gtest.h>

#include "ck/ck.hpp"
#include "ck/library/utility/host_tensor.hpp"

using namespace ck;

TEST(HostTensorTranspose, TestBadArugment)
{
    Tensor<float> tensor({13, 7});
    EXPECT_THROW(tensor.Transpose({0}), std::runtime_error);
    EXPECT_THROW(tensor.Transpose({0, 1, 2}), std::runtime_error);
}

TEST(HostTensorTranspose, Test2D)
{
    std::vector<size_t> lengths  = {13, 7};
    std::vector<size_t> tlengths = {7, 13};
    Tensor<float> tensor(lengths);
    tensor(0, 0) = 0.f;
    tensor(3, 4) = 34.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4), 34.f);
    EXPECT_EQ(tensor(4, 3), 0.f);
    EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose()(0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose()(4, 3), 34.f);
    EXPECT_EQ(tensor.Transpose()(3, 4), 0.f);
}

TEST(HostTensorTranspose, Test3D)
{
    std::vector<size_t> lengths  = {13, 7, 5};
    std::vector<size_t> tlengths = {5, 7, 13};
    Tensor<float> tensor(lengths);
    tensor(0, 0, 0) = 0.f;
    tensor(3, 4, 2) = 342.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4, 2), 342.f);
    EXPECT_EQ(tensor(4, 3, 2), 0.f);
    EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose()(0, 0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose()(2, 4, 3), 342.f);
    EXPECT_EQ(tensor.Transpose()(2, 3, 4), 0.f);
}

TEST(HostTensorTranspose, Test3D_021)
{
    std::vector<size_t> lengths  = {13, 7, 5};
    std::vector<size_t> tlengths = {13, 5, 7};
    Tensor<float> tensor(lengths);
    tensor(0, 0, 0) = 0.f;
    tensor(3, 4, 2) = 342.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4, 2), 342.f);
    EXPECT_EQ(tensor(4, 3, 2), 0.f);
    // transpose last two dimensions
    EXPECT_EQ(tensor.Transpose({0, 2, 1}).GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(0, 0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(2, 4, 3), 0.f);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(3, 2, 4), 342.f);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(2, 3, 4), 0.f);
    // transpose last two dimensions back again
    EXPECT_EQ(tensor.Transpose({0, 2, 1}).Transpose({0, 2, 1}).GetLengths(), lengths);
    EXPECT_EQ(tensor.Transpose({0, 2, 1}).Transpose({0, 2, 1})(3, 4, 2), 342.f);
}

TEST(HostTensorTranspose, TestNonpacked2D)
{
    std::vector<size_t> lengths  = {13, 7};
    std::vector<size_t> strides  = {100, 1};
    std::vector<size_t> tlengths = {7, 13};
    Tensor<float> tensor(lengths, strides);
    tensor(0, 0) = 0.f;
    tensor(3, 4) = 34.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4), 34.f);
    EXPECT_EQ(tensor(4, 3), 0.f);
    EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose()(0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose()(4, 3), 34.f);
    EXPECT_EQ(tensor.Transpose()(3, 4), 0.f);
}
test/softmax/CMakeLists.txt (view file @ 5736b460)

...
@@ -3,9 +3,12 @@ add_custom_target(test_softmax)

add_gtest_executable(test_softmax_rank3 test_softmax_rank3.cpp)
add_gtest_executable(test_softmax_rank4 test_softmax_rank4.cpp)
add_gtest_executable(test_softmax_interface test_softmax_interface.cpp)
add_gtest_executable(test_softmax_host_ref test_softmax_host_ref.cpp)
target_link_libraries(test_softmax_rank3 PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_rank4 PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_interface PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_host_ref PRIVATE utility)
add_dependencies(test_softmax test_softmax_rank3)
add_dependencies(test_softmax test_softmax_rank4)
add_dependencies(test_softmax test_softmax_interface)
add_dependencies(test_softmax test_softmax_host_ref)
test/softmax/test_softmax_host_ref.cpp (new file, mode 100644, view file @ 5736b460)

[diff collapsed, not expanded on this page]