Commit 289f15de authored by aska-0096's avatar aska-0096
Browse files

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into wmma_gemm

parents 9bd44685 d58b7f51
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <iostream> #include <iostream>
#include "profiler/include/profile_batched_gemm_impl.hpp" #include "profiler/profile_batched_gemm_impl.hpp"
namespace { namespace {
using ADataType = float; using ADataType = float;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <iostream> #include <iostream>
#include "profiler/include/profile_batched_gemm_impl.hpp" #include "profiler/profile_batched_gemm_impl.hpp"
namespace { namespace {
using ADataType = int8_t; using ADataType = int8_t;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <vector> #include <vector>
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
#include "profiler/include/profile_batched_gemm_gemm_impl.hpp" #include "profiler/profile_batched_gemm_gemm_impl.hpp"
using ck::tensor_operation::device::GemmSpecialization; using ck::tensor_operation::device::GemmSpecialization;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <iostream> #include <iostream>
#include "profiler/include/profile_batched_gemm_reduce_impl.hpp" #include "profiler/profile_batched_gemm_reduce_impl.hpp"
int main() int main()
{ {
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <vector> #include <vector>
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp" #include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
using ck::tensor_operation::device::GemmSpecialization; using ck::tensor_operation::device::GemmSpecialization;
template <ck::index_t N> template <ck::index_t N>
......
add_custom_target(test_batched_gemm_softmax_gemm_permute) add_custom_target(test_batched_gemm_softmax_gemm_permute)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp) add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16) target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
\ No newline at end of file add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_batched_gemm_softmax_gemm_permute_util.hpp"
template <typename Tuple>
class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
: public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
{
};
using I1_t = ck::Number<1>;
using I2_t = ck::Number<2>;
using MaskDisabled_t =
ck::integral_constant<MaskingSpecialization, MaskingSpecialization::MaskDisabled>;
using MaskOutUpperTriangle_t =
ck::integral_constant<MaskingSpecialization, MaskingSpecialization::MaskOutUpperTriangle>;
// clang-format off
using KernelTypes = ::testing::Types<
std::tuple<I2_t, I1_t, I1_t, I1_t, I1_t, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, MaskDisabled_t>,
std::tuple<I2_t, I1_t, I1_t, I1_t, I1_t, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, MaskOutUpperTriangle_t>
>;
// clang-format on
TYPED_TEST_SUITE(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, KernelTypes);
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16) { this->Run(); }
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadM)
{
this->lengths_ = std::vector<std::vector<int>>{
{136, 128, 32, 128, 2, 3},
};
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadN)
{
this->lengths_ = std::vector<std::vector<int>>{
{128, 136, 32, 128, 3, 2},
};
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadK)
{
this->lengths_ = std::vector<std::vector<int>>{
{128, 128, 40, 128, 2, 4},
{128, 128, 136, 128, 4, 2},
};
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadO)
{
this->lengths_ = std::vector<std::vector<int>>{
{128, 128, 32, 136, 1, 3},
};
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddM)
{
this->lengths_ = std::vector<std::vector<int>>{
{129, 128, 32, 128, 2, 3},
};
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddN)
{
this->lengths_ = std::vector<std::vector<int>>{
{128, 129, 32, 128, 4, 3},
};
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddK)
{
this->lengths_ = std::vector<std::vector<int>>{
{128, 128, 33, 128, 2, 3},
{128, 128, 129, 128, 2, 3},
};
this->Run();
}
// If kernel B1Layout is RowMajor, expect not to support odd O size
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddO)
{
this->lengths_ = std::vector<std::vector<int>>{
{128, 128, 32, 129, 2, 3},
};
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16_IrregularK)
{
this->lengths_ = std::vector<std::vector<int>>{{256, 256, 160, 160, 1, 16},
{256, 64, 160, 64, 1, 16},
{1024, 1024, 80, 80, 1, 16},
{1024, 64, 80, 64, 1, 16},
{4096, 4096, 40, 40, 1, 16},
{4096, 64, 40, 64, 1, 16}};
this->bench_ = true;
this->verify_ = false;
this->Run();
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16)
{
this->lengths_ = std::vector<std::vector<int>>{
{256, 256, 64, 64, 48, 16},
{256, 256, 128, 128, 48, 16},
{512, 512, 64, 64, 48, 16},
{512, 512, 128, 128, 48, 16},
{1024, 1024, 64, 64, 48, 16},
{1024, 1024, 128, 128, 48, 16},
{2048, 2048, 64, 64, 48, 16},
{2048, 2048, 128, 128, 48, 16},
{4096, 4096, 64, 64, 48, 16},
{4096, 4096, 128, 128, 48, 16},
};
this->bench_ = true;
this->verify_ = false;
this->Run();
}
using ck::tensor_operation::device::GemmSpecialization;
TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMatch)
{
int P = 120; // requires padding
int Q = 128; // do not require padding
// IsSupported(M, N, K, O)
// clang-format off
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(Q, Q, Q, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MPadding>{}.IsSupported(P, Q, Q, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NPadding>{}.IsSupported(Q, P, Q, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::KPadding>{}.IsSupported(Q, Q, P, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNPadding>{}.IsSupported(P, P, Q, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MKPadding>{}.IsSupported(P, Q, P, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NKPadding>{}.IsSupported(Q, P, P, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(P, P, P, Q));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::OPadding>{}.IsSupported(Q, Q, Q, P));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MOPadding>{}.IsSupported(P, Q, Q, P));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NOPadding>{}.IsSupported(Q, P, Q, P));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::KOPadding>{}.IsSupported(Q, Q, P, P));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNOPadding>{}.IsSupported(P, P, Q, P));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MKOPadding>{}.IsSupported(P, Q, P, P));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NKOPadding>{}.IsSupported(Q, P, P, P));
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(P, P, P, P));
// clang-format on
}
TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMismatch)
{
// IsSupported(M, N, K, O)
// clang-format off
EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(128, 128, 120, 128));
EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(128, 128, 128, 120));
// Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % ABSrcScalarPerVector == 0
EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 129, 128));
EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 130, 128));
// Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % B1SrcScalarPerVector == 0
EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 128, 129));
// clang-format on
}
TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
{
this->lengths_ = std::vector<std::vector<int>>{
{49, 49, 64, 64, 4, 6},
{64, 49, 64, 64, 4, 6},
{1020, 1020, 64, 128, 4, 6},
{576, 576, 64, 64, 4, 6},
};
this->Run();
}
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp" #include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
using ck::tensor_operation::device::GemmSpecialization; using ck::tensor_operation::device::GemmSpecialization;
using ck::tensor_operation::device::MaskingSpecialization; using ck::tensor_operation::device::MaskingSpecialization;
...@@ -16,7 +16,8 @@ using ck::tensor_operation::device::TensorSpecialization; ...@@ -16,7 +16,8 @@ using ck::tensor_operation::device::TensorSpecialization;
template <ck::index_t N> template <ck::index_t N>
using I = ck::Number<N>; using I = ck::Number<N>;
using F16 = ck::half_t; using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using Row = ck::tensor_layout::gemm::RowMajor; using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor; using Col = ck::tensor_layout::gemm::ColumnMajor;
...@@ -63,7 +64,7 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test ...@@ -63,7 +64,7 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
ck::Tuple<>, ck::Tuple<>,
ck::Tuple<>, ck::Tuple<>,
MaskingType::value>( MaskingType::value>(
verify_, 1, false, bench_, M, N, K, O, G0, G1); verify_, 2, false, bench_, M, N, K, O, G0, G1);
EXPECT_TRUE(pass); EXPECT_TRUE(pass);
} }
...@@ -224,3 +225,144 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128 ...@@ -224,3 +225,144 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128
return gemm.IsSupportedArgument(argument); return gemm.IsSupportedArgument(argument);
} }
}; };
template <GemmSpecialization GemmSpec>
struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using ADataType = BF16;
using B0DataType = BF16;
using B1DataType = BF16;
using AccDataType = float;
using CShuffleDataType = BF16;
using CDataType = BF16;
using AElementOp = PassThrough;
using B0ElementOp = PassThrough;
using Acc0ElementOp = Scale;
using B1ElementOp = PassThrough;
using CElementOp = PassThrough;
// static constexpr auto GemmSpec = std::tuple_element_t<0, Tuple>::value;
using DeviceGemmGemmInstance =
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
2,
1,
1,
1,
1,
ADataType,
B0DataType,
B1DataType,
CDataType,
ck::Tuple<>,
ck::Tuple<>,
AccDataType,
CShuffleDataType,
AElementOp,
B0ElementOp,
Acc0ElementOp,
B1ElementOp,
CElementOp,
GemmSpec,
TensorSpecialization::Default, // ATensorSpec
TensorSpecialization::Default, // B0TensorSpec
TensorSpecialization::Default, // B1TensorSpec
TensorSpecialization::Default, // CTensorSpec
1,
256,
128, // MPerBlock
128, // NPerBlock
32, // KPerBlock
128, // Gemm1NPerBlock
32, // Gemm1KPerBlock
8, // AK1
8, // BK1
2, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<8, 32, 1>, // B1BlockTransfer
S<0, 2, 1>,
S<0, 2, 1>,
1,
4,
2,
false,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8, // CShuffleBlockTransferScalarPerVector_NPerBlock
MaskingSpecialization::MaskOutUpperTriangle>; // MaskOutUpperTriangle
bool IsSupported(int M, int N, int K, int O)
{
const int G0 = 1, G1 = 1;
// A layout [G0, M, G1, K]
std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
// B0 layout [G0, N, G1, K]
std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
// B1 layout [G0, N, G1, O]
std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
// C layout [G0, M, G1, O]
std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
auto gemm = DeviceGemmGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(nullptr),
static_cast<B0DataType*>(nullptr),
static_cast<B1DataType*>(nullptr),
static_cast<CDataType*>(nullptr),
{}, // p_acc0_biases
{}, // p_acc1_biases
a_gs_ms_ks_lengths,
a_gs_ms_ks_strides,
b0_gs_ns_ks_lengths,
b0_gs_ns_ks_strides,
b1_gs_os_ns_lengths,
b1_gs_os_ns_strides,
c_gs_ms_os_lengths,
c_gs_ms_os_strides,
{}, // acc0_biases_gs_ms_ns_lengths
{}, // acc0_biases_gs_ms_ns_strides
{}, // acc1_biases_gs_ms_os_lengths
{}, // acc1_biases_gs_ms_os_strides
PassThrough{}, // a_element_op
PassThrough{}, // b0_element_op
Scale{1.f}, // acc0_element_op
PassThrough{}, // b1_element_op
PassThrough{}); // c_element_op
return gemm.IsSupportedArgument(argument);
}
};
add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp)
add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp)
target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance)
target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <tuple>
#include <gtest/gtest.h>
#include "profiler/profile_batchnorm_backward_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using F64 = double;
template <typename Tuple>
class TestBatchNormBwdRank4 : public ::testing::Test
{
private:
const double epsilon = std::numeric_limits<float>::epsilon();
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using DxDataType = std::tuple_element_t<1, Tuple>;
using DyDataType = std::tuple_element_t<2, Tuple>;
using AccDataType = std::tuple_element_t<3, Tuple>;
using ScaleDataType = std::tuple_element_t<4, Tuple>;
using BiasDataType = std::tuple_element_t<5, Tuple>;
using MeanVarDataType = std::tuple_element_t<6, Tuple>;
std::vector<std::vector<size_t>> list_of_lengths = {
{128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
std::vector<int> reduceDims;
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
{
bool pass = true;
EXPECT_FALSE(reduceDims.size() != NumReduceDim);
pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, true, epsilon);
pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, false, epsilon);
EXPECT_TRUE(pass);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F32, F32, F32, F16, F32, F32>,
std::tuple<F32, F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>,
std::tuple<F64, F64, F64, F64, F64, F64, F64>>;
TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes);
// nhwc
TYPED_TEST(TestBatchNormBwdRank4, nhwc)
{
this->reduceDims = {0, 1, 2};
this->template Run<3>();
}
// nchw
TYPED_TEST(TestBatchNormBwdRank4, nchw)
{
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <tuple>
#include <gtest/gtest.h>
#include "profiler/profile_batchnorm_forward_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F64 = double;
template <typename Tuple>
class TestBatchNormFwdRank4 : public ::testing::Test
{
private:
const double epsilon = std::numeric_limits<float>::epsilon();
const double averageFactor = 0.1;
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using YDataType = std::tuple_element_t<1, Tuple>;
using AccDataType = std::tuple_element_t<2, Tuple>;
using ScaleDataType = std::tuple_element_t<3, Tuple>;
using BiasDataType = std::tuple_element_t<4, Tuple>;
using MeanVarDataType = std::tuple_element_t<5, Tuple>;
std::vector<std::vector<size_t>> list_of_lengths = {
{128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
std::vector<int> reduceDims;
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
{
bool pass = true;
EXPECT_FALSE(reduceDims.size() != NumReduceDim);
pass =
pass && ck::profiler::profile_batchnorm_forward_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(true,
3,
false,
false,
inOutLengths,
reduceDims,
true,
true,
epsilon,
averageFactor);
pass =
pass && ck::profiler::profile_batchnorm_forward_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(true,
3,
false,
false,
inOutLengths,
reduceDims,
false,
false,
epsilon,
averageFactor);
EXPECT_TRUE(pass);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
std::tuple<F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
std::tuple<F64, F64, F64, F64, F64, F64>>;
TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);
// nhwc
TYPED_TEST(TestBatchNormFwdRank4, nhwc)
{
this->reduceDims = {0, 1, 2};
this->template Run<3>();
}
// nchw
TYPED_TEST(TestBatchNormFwdRank4, nchw)
{
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include <tuple> #include <tuple>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "profiler/include/profile_conv_bwd_data_impl.hpp" #include "profiler/profile_conv_bwd_data_impl.hpp"
template <typename Tuple> template <typename Tuple>
class TestConvndBwdData : public ::testing::Test class TestConvndBwdData : public ::testing::Test
......
add_gtest_executable(test_convnd_bwd_weight convnd_bwd_weight.cpp)
target_link_libraries(test_convnd_bwd_weight PRIVATE utility device_conv1d_bwd_weight_instance device_conv2d_bwd_weight_instance device_conv3d_bwd_weight_instance)
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include <tuple> #include <tuple>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "profiler/include/profile_conv_fwd_impl.hpp" #include "profiler/profile_conv_fwd_impl.hpp"
template <typename Tuple> template <typename Tuple>
class TestConvndFwd : public ::testing::Test class TestConvndFwd : public ::testing::Test
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "profiler/include/profile_elementwise_layernorm_impl.hpp" #include "profiler/profile_elementwise_layernorm_impl.hpp"
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -128,15 +129,15 @@ struct TestGemm ...@@ -128,15 +129,15 @@ struct TestGemm
{ {
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -229,27 +230,27 @@ struct TestGemm ...@@ -229,27 +230,27 @@ struct TestGemm
bool res = false; bool res = false;
if(std::is_same<CDataType, float>::value) if(std::is_same<CDataType, float>::value)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, ck::half_t>::value) else if(std::is_same<CDataType, ck::half_t>::value)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, ck::bhalf_t>::value) else if(std::is_same<CDataType, ck::bhalf_t>::value)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, int8_t>::value) else if(std::is_same<CDataType, int8_t>::value)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, double>::value) else if(std::is_same<CDataType, double>::value)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <iostream> #include <iostream>
#include "profiler/include/profile_gemm_reduce_impl.hpp" #include "profiler/profile_gemm_reduce_impl.hpp"
int main() int main()
{ {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/host_gemm.hpp" #include "ck/library/utility/host_gemm.hpp"
...@@ -93,15 +94,15 @@ int test_gemm(const gemmArgs& args) ...@@ -93,15 +94,15 @@ int test_gemm(const gemmArgs& args)
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, bool row_major) { [](std::size_t row, std::size_t col, std::size_t stride, bool row_major) {
using namespace ck::literals;
if(row_major) if(row_major)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -225,9 +226,8 @@ int main(int argc, char* argv[]) ...@@ -225,9 +226,8 @@ int main(int argc, char* argv[])
std::vector<gemmArgs> test_cases; std::vector<gemmArgs> test_cases;
if(argc == 1) if(argc == 1)
{ {
test_cases = {{GemmMatrixLayout::MK_KN_MN, 3, 3, 3, 3, 3, 3, 1}}; test_cases = {{GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 2},
// JD: Populate with more and meaningful {GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 8}};
return 0;
} }
else if(argc == 9) else if(argc == 9)
{ {
...@@ -252,11 +252,10 @@ int main(int argc, char* argv[]) ...@@ -252,11 +252,10 @@ int main(int argc, char* argv[])
printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n"); printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n");
return -1; return -1;
} }
bool error = false;
for(const auto& kinder : test_cases) for(const auto& kinder : test_cases)
{ {
const auto res = test_gemm(kinder); error |= test_gemm(kinder);
if(!res)
return -1;
} }
return 0; return error ? 1 : 0;
} }
add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp)
target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
...@@ -4,14 +4,15 @@ ...@@ -4,14 +4,15 @@
#include <cstdlib> #include <cstdlib>
#include <iostream> #include <iostream>
#include <initializer_list> #include <initializer_list>
#include <vector>
#include <tuple> #include <tuple>
#include <vector>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "profiler/include/profile_conv_bwd_weight_impl.hpp" #include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
template <typename Tuple> template <typename Tuple>
class TestConvndBwdWeight : public ::testing::Test class TestGroupedConvndBwdWeight : public ::testing::Test
{ {
protected: protected:
using DataType = std::tuple_element_t<0, Tuple>; using DataType = std::tuple_element_t<0, Tuple>;
...@@ -25,20 +26,20 @@ class TestConvndBwdWeight : public ::testing::Test ...@@ -25,20 +26,20 @@ class TestConvndBwdWeight : public ::testing::Test
{ {
bool pass; bool pass;
EXPECT_FALSE(conv_params.empty()); EXPECT_FALSE(conv_params.empty());
pass = ck::profiler::profile_conv_bwd_weight_impl< pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<
NDimSpatial, NDimSpatial,
ck::tuple_element_t<NDimSpatial - 1, ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWC, ck::Tuple<ck::tensor_layout::convolution::GNWC,
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::GNHWC,
ck::tensor_layout::convolution::NDHWC>>, ck::tensor_layout::convolution::GNDHWC>>,
ck::tuple_element_t<NDimSpatial - 1, ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::KXC, ck::Tuple<ck::tensor_layout::convolution::GKXC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::KZYXC>>, ck::tensor_layout::convolution::GKZYXC>>,
ck::tuple_element_t<NDimSpatial - 1, ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::NWK, ck::Tuple<ck::tensor_layout::convolution::GNWK,
ck::tensor_layout::convolution::NHWK, ck::tensor_layout::convolution::GNHWK,
ck::tensor_layout::convolution::NDHWK>>, ck::tensor_layout::convolution::GNDHWK>>,
DataType, DataType,
DataType, DataType,
DataType>(true, // do_verification DataType>(true, // do_verification
...@@ -54,37 +55,37 @@ class TestConvndBwdWeight : public ::testing::Test ...@@ -54,37 +55,37 @@ class TestConvndBwdWeight : public ::testing::Test
using KernelTypes = using KernelTypes =
::testing::Types<std::tuple<float>, std::tuple<ck::half_t>, std::tuple<ck::bhalf_t>>; ::testing::Types<std::tuple<float>, std::tuple<ck::half_t>, std::tuple<ck::bhalf_t>>;
TYPED_TEST_SUITE(TestConvndBwdWeight, KernelTypes); TYPED_TEST_SUITE(TestGroupedConvndBwdWeight, KernelTypes);
TYPED_TEST(TestConvndBwdWeight, Test1D) TYPED_TEST(TestGroupedConvndBwdWeight, Test1D)
{ {
this->conv_params.clear(); this->conv_params.clear();
this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); this->conv_params.push_back({1, 4, 64, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
this->template Run<1>(); this->template Run<1>();
} }
TYPED_TEST(TestConvndBwdWeight, Test2D) TYPED_TEST(TestGroupedConvndBwdWeight, Test2D)
{ {
this->conv_params.clear(); this->conv_params.clear();
this->conv_params.push_back( this->conv_params.push_back(
{2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); {2, 4, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back( this->conv_params.push_back(
{2, 1, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); {2, 4, 8, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back( this->conv_params.push_back(
{2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); {2, 4, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->template Run<2>(); this->template Run<2>();
} }
TYPED_TEST(TestConvndBwdWeight, Test3D) TYPED_TEST(TestGroupedConvndBwdWeight, Test3D)
{ {
this->conv_params.clear(); this->conv_params.clear();
this->conv_params.push_back( this->conv_params.push_back(
{3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); {3, 4, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back( this->conv_params.push_back(
{3, 1, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); {3, 4, 8, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back( this->conv_params.push_back(
{3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); {3, 4, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->template Run<3>(); this->template Run<3>();
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment