Commit cb2d4dbb authored by ltqin

Merge branch 'attn-bwd-dropout' into attn-fwd-train-dropout

parents 989e3d10 0e7aeef5
...@@ -100,6 +100,17 @@ __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
return r;
}
template <typename... Xs, index_t N>
__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Number<N>& y)
{
constexpr index_t NSize = sizeof...(Xs);
// Tuple<Xs...> r;
// static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y; });
// return r;
return generate_tuple([&](auto i) { return x[i] * y; }, Number<NSize>{});
}
// MultiIndex = scalar * MultiIndex
template <typename... Xs,
          typename Y,
...
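For reference, the new overload builds the scaled tuple with generate_tuple rather than the static_for mutation kept in the comments. Below is a minimal standalone analogue in plain C++ of the same pattern (std::tuple and std::apply stand in for CK's Tuple and generate_tuple; this is a sketch, not CK's API):
#include <tuple>
// Elementwise scale of a tuple by a scalar. The result is generated as a new
// tuple instead of mutating a default-constructed one, mirroring the
// generate_tuple-based operator* added above.
template <typename... Xs, typename Y>
constexpr auto scale_tuple(const std::tuple<Xs...>& x, const Y& y)
{
    return std::apply([&](const auto&... xs) { return std::make_tuple((xs * y)...); }, x);
}
int main()
{
    constexpr auto r = scale_tuple(std::make_tuple(1, 2, 3), 4);
    static_assert(std::get<0>(r) == 4 && std::get<1>(r) == 8 && std::get<2>(r) == 12);
    return 0;
}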
...@@ -19,4 +19,37 @@ struct ThisThreadBlock
__device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }
};
template <index_t ThreadPerBlock>
struct SubThreadBlock
{
static constexpr index_t kNumThread_ = ThreadPerBlock;
__device__ SubThreadBlock(int mwave, int nwave) : mwave_(mwave), nwave_(nwave) {}
__device__ static constexpr index_t GetNumOfThread() { return kNumThread_; }
template <typename TupleArg1, typename TupleArg2>
__device__ constexpr bool IsBelong(const TupleArg1& mwave_range, const TupleArg2& nwave_range)
{
// wave_range[I0] inclusive, wave_range[I1] exclusive
if(mwave_ < mwave_range[I0])
return false;
else if(mwave_ >= mwave_range[I1])
return false;
else if(nwave_ < nwave_range[I0])
return false;
else if(nwave_ >= nwave_range[I1])
return false;
else
return true;
}
__device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }
private:
index_t mwave_, nwave_;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
};
} // namespace ck
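For reference, SubThreadBlock::IsBelong treats each wave range as half-open: index I0 is the inclusive lower bound and I1 the exclusive upper bound. A minimal standalone sketch of the same membership test in plain C++ (std::pair stands in for CK's Tuple/Number indexing; the names are illustrative only):
#include <utility>
// Half-open range membership test, mirroring SubThreadBlock::IsBelong:
// range.first is the inclusive lower bound, range.second the exclusive upper bound.
constexpr bool is_belong(int mwave, int nwave,
                         std::pair<int, int> mwave_range,
                         std::pair<int, int> nwave_range)
{
    return mwave >= mwave_range.first && mwave < mwave_range.second &&
           nwave >= nwave_range.first && nwave < nwave_range.second;
}
int main()
{
    static_assert(is_belong(1, 2, {0, 2}, {0, 4}));  // inside both ranges
    static_assert(!is_belong(2, 2, {0, 2}, {0, 4})); // mwave hits the exclusive upper bound
    return 0;
}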
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include <vector>
#include <algorithm>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename RefDataType, typename InDataType, typename OutDataType>
struct ReferenceDropout : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const Tensor<RefDataType>& ref,
const Tensor<InDataType>& in,
Tensor<OutDataType>& out,
RefDataType p_dropout_in_16bits,
float rp_dropout)
: ref_(ref),
in_(in),
out_(out),
p_dropout_in_16bits_(p_dropout_in_16bits),
rp_dropout_(ck::type_convert<OutDataType>(rp_dropout))
{
}
const Tensor<RefDataType>& ref_;
const Tensor<InDataType>& in_;
Tensor<OutDataType>& out_;
RefDataType p_dropout_in_16bits_;
OutDataType rp_dropout_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
arg.out_.ForEach([&](auto& self, auto idx) {
self(idx) =
arg.ref_(idx) < arg.p_dropout_in_16bits_ ? arg.in_(idx) * arg.rp_dropout_ : 0;
});
return 0;
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const Tensor<RefDataType>& ref,
const Tensor<InDataType>& in,
Tensor<OutDataType>& out,
RefDataType p_dropout_in_16bits,
float rp_dropout)
{
return Argument{ref, in, out, p_dropout_in_16bits, rp_dropout};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceDropout"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
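For context, this reference implements inverted dropout: an element is kept when its random reference value is below the 16-bit keep threshold, and kept elements are rescaled by rp_dropout (the reciprocal of the keep probability); everything else becomes zero. A minimal standalone sketch of the same element-wise rule (plain std::vector stands in for the Tensor class; the threshold and scale values in main are illustrative):
#include <cstddef>
#include <cstdint>
#include <vector>
// Element-wise inverted dropout, mirroring ReferenceDropout::Invoker::Run:
// keep (and rescale by 1/keep_probability) when ref[i] is below the 16-bit
// keep threshold, otherwise drop the element to zero.
std::vector<float> reference_dropout(const std::vector<std::uint16_t>& ref,
                                     const std::vector<float>& in,
                                     std::uint16_t p_dropout_in_16bits,
                                     float rp_dropout)
{
    std::vector<float> out(in.size());
    for(std::size_t i = 0; i < in.size(); ++i)
        out[i] = ref[i] < p_dropout_in_16bits ? in[i] * rp_dropout : 0.f;
    return out;
}
int main()
{
    // Keep threshold 32768/65536 = 0.5, so surviving elements are scaled by 2.
    auto out = reference_dropout({10000, 50000}, {1.f, 1.f}, 32768, 2.f);
    return (out[0] == 2.f && out[1] == 0.f) ? 0 : 1;
}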
...@@ -11,6 +11,25 @@
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
/*
For an fp16 M-contiguous matrix of size M_K, each thread reads a 4x2 tile (2 x 64 bits) from
global memory, transposes the 4x2 tile in registers, and writes it into LDS in the K0_M_K1
layout. This allows us to use the 128-bit LDS write instruction. It also avoids write bank
conflicts, because two vertically adjacent 4x2 tiles form a contiguous chunk of memory when
modeled in the K0_M_K1 layout with K1=2.
<- K1 -> <- K1 -> <- K1 ->
_________ _________ _________
| | 0 | 4 | transpose | 0 - 1 | to LDS | 0 - 1 |
| | 1 | 5 | ---> | 2 - 3 | ----> | 2 - 3 |
| | 2 | 6 | | 4 - 5 | | 4 - 5 |
M | | 3 | 7 | | 6 - 7 | | 6 - 7 |
| --------- --------- ---------
| | ... | | ... | | ... |
v --------- --------- ---------
VMEM VGPR LDS
*/
namespace ck {
namespace tensor_operation {
namespace device {
...
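To make the layout in the comment above concrete, here is a small host-side sketch of the per-thread step it describes, with a scalar loop standing in for the vectorized 64-bit reads and the 128-bit LDS write (all names are illustrative, not the kernel's code):
#include <array>
#include <cstdint>
#include <cstdio>
// Gather a 4 (M) x 2 (K1) tile from an M-contiguous M_K matrix and emit it as
// one contiguous 8-element run ordered m0k0, m0k1, m1k0, m1k1, ... -- i.e.
// K0_M_K1 with K1 = 2, which is what makes a single 128-bit LDS write possible.
std::array<std::uint16_t, 8> transpose_4x2_tile(const std::uint16_t* mat, int M, int m0, int k0)
{
    std::array<std::uint16_t, 8> lds_run{};
    for(int m = 0; m < 4; ++m)
        for(int k1 = 0; k1 < 2; ++k1)
            // M-contiguous source: element (m0 + m, k0 + k1) lives at (k0 + k1) * M + (m0 + m).
            lds_run[m * 2 + k1] = mat[(k0 + k1) * M + (m0 + m)];
    return lds_run;
}
int main()
{
    // 8 (M) x 2 (K) matrix stored M-contiguously, element (m, k) holds 100 * k + m.
    std::array<std::uint16_t, 16> mat{};
    for(int k = 0; k < 2; ++k)
        for(int m = 0; m < 8; ++m)
            mat[k * 8 + m] = static_cast<std::uint16_t>(100 * k + m);
    const auto run = transpose_4x2_tile(mat.data(), 8, 0, 0);
    std::printf("%d %d %d %d\n", run[0], run[1], run[2], run[3]); // 0 100 1 101
    return 0;
}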
...@@ -58,3 +58,4 @@ add_subdirectory(batchnorm)
if(GPU_TARGETS MATCHES "gfx1100")
add_subdirectory(wmma_op)
endif()
add_subdirectory(host_tensor)
add_gtest_executable(test_host_tensor test_host_tensor.cpp)
target_link_libraries(test_host_tensor PRIVATE utility)
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/library/utility/host_tensor.hpp"
using namespace ck;
TEST(HostTensorTranspose, TestBadArgument)
{
Tensor<float> tensor({13, 7});
EXPECT_THROW(tensor.Transpose({0}), std::runtime_error);
EXPECT_THROW(tensor.Transpose({0, 1, 2}), std::runtime_error);
}
TEST(HostTensorTranspose, Test2D)
{
std::vector<size_t> lengths = {13, 7};
std::vector<size_t> tlengths = {7, 13};
Tensor<float> tensor(lengths);
tensor(0, 0) = 0.f;
tensor(3, 4) = 34.f;
EXPECT_EQ(tensor.GetLengths(), lengths);
EXPECT_EQ(tensor(0, 0), 0.f);
EXPECT_EQ(tensor(3, 4), 34.f);
EXPECT_EQ(tensor(4, 3), 0.f);
EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
EXPECT_EQ(tensor.Transpose()(0, 0), 0.f);
EXPECT_EQ(tensor.Transpose()(4, 3), 34.f);
EXPECT_EQ(tensor.Transpose()(3, 4), 0.f);
}
TEST(HostTensorTranspose, Test3D)
{
std::vector<size_t> lengths = {13, 7, 5};
std::vector<size_t> tlengths = {5, 7, 13};
Tensor<float> tensor(lengths);
tensor(0, 0, 0) = 0.f;
tensor(3, 4, 2) = 342.f;
EXPECT_EQ(tensor.GetLengths(), lengths);
EXPECT_EQ(tensor(0, 0, 0), 0.f);
EXPECT_EQ(tensor(3, 4, 2), 342.f);
EXPECT_EQ(tensor(4, 3, 2), 0.f);
EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
EXPECT_EQ(tensor.Transpose()(0, 0, 0), 0.f);
EXPECT_EQ(tensor.Transpose()(2, 4, 3), 342.f);
EXPECT_EQ(tensor.Transpose()(2, 3, 4), 0.f);
}
TEST(HostTensorTranspose, Test3D_021)
{
std::vector<size_t> lengths = {13, 7, 5};
std::vector<size_t> tlengths = {13, 5, 7};
Tensor<float> tensor(lengths);
tensor(0, 0, 0) = 0.f;
tensor(3, 4, 2) = 342.f;
EXPECT_EQ(tensor.GetLengths(), lengths);
EXPECT_EQ(tensor(0, 0, 0), 0.f);
EXPECT_EQ(tensor(3, 4, 2), 342.f);
EXPECT_EQ(tensor(4, 3, 2), 0.f);
// transpose last two dimensions
EXPECT_EQ(tensor.Transpose({0, 2, 1}).GetLengths(), tlengths);
EXPECT_EQ(tensor.Transpose({0, 2, 1})(0, 0, 0), 0.f);
EXPECT_EQ(tensor.Transpose({0, 2, 1})(2, 4, 3), 0.f);
EXPECT_EQ(tensor.Transpose({0, 2, 1})(3, 2, 4), 342.f);
EXPECT_EQ(tensor.Transpose({0, 2, 1})(2, 3, 4), 0.f);
// transpose last two dimensions back again
EXPECT_EQ(tensor.Transpose({0, 2, 1}).Transpose({0, 2, 1}).GetLengths(), lengths);
EXPECT_EQ(tensor.Transpose({0, 2, 1}).Transpose({0, 2, 1})(3, 4, 2), 342.f);
}
TEST(HostTensorTranspose, TestNonpacked2D)
{
std::vector<size_t> lengths = {13, 7};
std::vector<size_t> strides = {100, 1};
std::vector<size_t> tlengths = {7, 13};
Tensor<float> tensor(lengths, strides);
tensor(0, 0) = 0.f;
tensor(3, 4) = 34.f;
EXPECT_EQ(tensor.GetLengths(), lengths);
EXPECT_EQ(tensor(0, 0), 0.f);
EXPECT_EQ(tensor(3, 4), 34.f);
EXPECT_EQ(tensor(4, 3), 0.f);
EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
EXPECT_EQ(tensor.Transpose()(0, 0), 0.f);
EXPECT_EQ(tensor.Transpose()(4, 3), 34.f);
EXPECT_EQ(tensor.Transpose()(3, 4), 0.f);
}
...@@ -3,9 +3,12 @@ add_custom_target(test_softmax)
add_gtest_executable(test_softmax_rank3 test_softmax_rank3.cpp)
add_gtest_executable(test_softmax_rank4 test_softmax_rank4.cpp)
add_gtest_executable(test_softmax_interface test_softmax_interface.cpp)
add_gtest_executable(test_softmax_host_ref test_softmax_host_ref.cpp)
target_link_libraries(test_softmax_rank3 PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_rank4 PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_interface PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_host_ref PRIVATE utility)
add_dependencies(test_softmax test_softmax_rank3)
add_dependencies(test_softmax test_softmax_rank4)
add_dependencies(test_softmax test_softmax_interface)
add_dependencies(test_softmax test_softmax_host_ref)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "gtest/gtest.h"
using namespace ck;
TEST(ReferenceSoftmax, Run)
{
Tensor<float> x({2, 2});
Tensor<float> y({2, 2});
x.GenerateTensorValue(GeneratorTensor_Diagonal<float>{});
using ReferenceSoftmax = tensor_operation::host::ReferenceSoftmax<float, float, float>;
float alpha = 1.f;
float beta = 0.f;
auto ref_softmax = ReferenceSoftmax{};
auto ref_softmax_invoker = ref_softmax.MakeInvoker();
auto ref_softmax_argument = ref_softmax.MakeArgument(x, y, alpha, beta, {1});
ref_softmax_invoker.Run(ref_softmax_argument);
EXPECT_TRUE((utils::check_err(
y.mData, std::vector<float>{0.73105858f, 0.268941421f, 0.26894142f, 0.73105858f})));
}
TEST(ReferenceSoftmax, RunWithCalculatedStats)
{
// >>> x = np.eye(4)
// >>> m = np.max(np.exp(x), axis=1, keepdims=True)
// >>> l = np.sum(np.exp(x - np.tile(m, (1,4))), axis=1, keepdims=True)
// >>> m + np.log(l)
// array([[1.74366838],
// [1.74366838],
// [1.74366838],
// [1.74366838]])
Tensor<float> x({4, 4});
Tensor<float> y({4, 4});
Tensor<float> stats({4});
x.GenerateTensorValue(GeneratorTensor_Diagonal<float>{});
using ReferenceSoftmax = tensor_operation::host::ReferenceSoftmax<float, float, float>;
float alpha = 1.f;
float beta = 0.f;
auto ref_softmax = ReferenceSoftmax{};
auto ref_softmax_invoker = ref_softmax.MakeInvoker();
{
auto ref_softmax_argument = ref_softmax.MakeArgument(x, y, alpha, beta, {1}, &stats);
ref_softmax_invoker.Run(ref_softmax_argument);
EXPECT_TRUE((utils::check_err(
stats.mData, std::vector<float>{1.74366838f, 1.74366838f, 1.74366838f, 1.74366838f})));
}
{
Tensor<float> yy({4, 4});
auto ref_softmax_argument = ref_softmax.MakeArgument(x, yy, alpha, beta, {1}, &stats);
ref_softmax_invoker.RunWithPreCalcStats(ref_softmax_argument);
EXPECT_TRUE((utils::check_err(y.mData, yy.mData)));
}
}