Commit 32806d5f authored by Jun Liu

Merge branch 'amd-develop' into amd-master

parents e70a4d19 d0f355a3
@@ -98,7 +98,7 @@ int profile_groupnorm(int argc, char* argv[])
}
else if(data_type == ck::DataTypeEnum::Half)
{
- ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16, F32, false>(
+ ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16, F16, false>(
do_verification, init_method, do_log, time_kernel, length);
}
else
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_layernorm_bwd_data_impl.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
struct layernormBwdDataArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};
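// parse_opt consumes every integer that follows "--<key>" until the next
// "-"-prefixed token (or the end of argv); e.g. "--length 1502 4096" yields
// long_opts["length"] == {1502, 4096}.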
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
void print_help_layernorm_bwd_data()
{
// e.g.: ckProfiler layernorm_bwd_data 0 0 2 0 1 --length 1502 4096
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "--length: tensor extents (e.g, --length 1024 1024) \n"
<< std::endl;
}
int profile_layernorm_bwd_data(int argc, char* argv[])
{
// need the operation name plus the five positional arguments
if(argc < 7)
{
print_help_layernorm_bwd_data();
return 0;
}
layernormBwdDataArgParser arg_parser;
// short unnamed options
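// argv[1] holds the operation name ("layernorm_bwd_data"), so the positional
// options documented as arg1..arg5 start at argv[2].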
const ck::DataTypeEnum data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
// parse the long options
arg_parser(argc, argv);
const std::vector<index_t> length = arg_parser.long_opts["length"];
using F16 = ck::half_t;
using F32 = float;
if(length.size() == 2)
{
constexpr int rank = 2;
if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_layernorm_bwd_data_impl<F16, F16, F16, F16, F32, F16, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_layernorm_bwd_data_impl<F32, F32, F32, F32, F32, F32, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else
{
throw std::runtime_error("not implemented yet");
}
}
else
{
throw std::runtime_error("not implemented yet");
}
return 0;
}
REGISTER_PROFILER_OPERATION("layernorm_bwd_data",
"Layer Normalization",
profile_layernorm_bwd_data);
@@ -104,7 +104,7 @@ int profile_layernorm(int argc, char* argv[])
if(data_type == ck::DataTypeEnum::Half)
{
- ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
+ ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F16, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Float)
@@ -125,4 +125,4 @@ int profile_layernorm(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION("layernorm", "Layer Normalization", profile_layernorm);
REGISTER_PROFILER_OPERATION("layernorm_fwd", "Layer Normalization", profile_layernorm);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_transpose_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct MatrixLayout
{
NCDHW, // 0
NCHWD, // 1
};
enum struct DataType
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F16_F16, // 1
};
#define OP_NAME "transpose"
#define OP_DESC "Transpose"
int profile_transpose(int argc, char* argv[])
{
if(argc != 12)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
// printf("arg3: matrix layout (NCDHW -> NDCHW);\n");
printf("arg3: verification (0: no; 1: yes)\n");
printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg5: print tensor value (0: no; 1: yes)\n");
printf("arg6: time kernel (0=no, 1=yes)\n");
printf("arg7 to arg11: N, C, D, H, W\n");
exit(1);
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
// const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
// N, C, D, H, W
std::vector<ck::index_t> lengths(5);
for(int i = 0; i < 5; ++i)
{
lengths[i] = std::stoi(argv[7 + i]);
}
using F32 = float;
using F16 = ck::half_t;
auto profile = [&](auto a_type, auto b_type) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
do_verification, init_method, do_log, time_kernel, lengths);
return pass ? 0 : 1;
};
if(data_type == DataType::F32_F32_F32_F32_F32)
{
return profile(F32{}, F32{});
}
else if(data_type == DataType::F16_F16_F16_F16_F16)
{
return profile(F16{}, F16{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
@@ -140,6 +140,7 @@ add_subdirectory(grouped_convnd_bwd_weight)
add_subdirectory(block_to_ctile_map)
add_subdirectory(softmax)
add_subdirectory(normalization_fwd)
+ add_subdirectory(normalization_bwd_data)
add_subdirectory(data_type)
add_subdirectory(elementwise_normalization)
add_subdirectory(batchnorm)
@@ -149,6 +150,8 @@ add_subdirectory(batched_gemm_multi_d)
add_subdirectory(grouped_convnd_bwd_data)
add_subdirectory(conv_tensor_rearrange)
add_subdirectory(transpose)
+ add_subdirectory(permute_scale)
+ add_subdirectory(wrapper)
if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op)
endif()
add_custom_target(test_normalization_bwd_data)
add_gtest_executable(test_layernorm2d_bwd_data_fp32 test_layernorm2d_bwd_data_fp32.cpp)
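# NOTE: add_gtest_executable reports through `result`; 0 appears to mean the
# test target was created, so linking and dependencies are guarded on it.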
if(result EQUAL 0)
target_link_libraries(test_layernorm2d_bwd_data_fp32 PRIVATE utility device_normalization_bwd_data_instance)
add_dependencies(test_normalization_bwd_data test_layernorm2d_bwd_data_fp32)
endif()
add_gtest_executable(test_groupnorm_bwd_data_fp32 test_groupnorm_bwd_data_fp32.cpp)
if(result EQUAL 0)
target_link_libraries(test_groupnorm_bwd_data_fp32 PRIVATE utility device_normalization_bwd_data_instance)
add_dependencies(test_normalization_bwd_data test_groupnorm_bwd_data_fp32)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_groupnorm_bwd_data_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
template <typename Tuple>
class TestGroupnormBwdData : public ::testing::Test
{
protected:
using DYDataType = std::tuple_element_t<0, Tuple>;
using XDataType = std::tuple_element_t<1, Tuple>;
using GammaDataType = std::tuple_element_t<2, Tuple>;
using MeanInvStdDataType = std::tuple_element_t<3, Tuple>;
using ComputeDataType = std::tuple_element_t<4, Tuple>;
using DXDataType = std::tuple_element_t<5, Tuple>;
void Run()
{
// Bwd data: [N, H, W, G, C], reduce H, W, C
std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
{1, 2, 3, 4, 5},
{256, 9, 9, 9, 9},
{1, 64, 64, 32, 10},
{1, 32, 32, 32, 20},
{1, 16, 16, 32, 40}};
for(auto length : lengths)
{
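// args: do_verification = true, init_method = 2 (decimal values),
// do_log = false, time_kernel = false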
bool success = ck::profiler::profile_groupnorm_bwd_data_impl<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
ComputeDataType,
DXDataType>(
true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = ::testing::Types<
// DYDataType, XDataType, GammaDataType, MeanInvStdDataType, ComputeDataType, DXDataType
std::tuple<F32, F32, F32, F32, F32, F32>>;
TYPED_TEST_SUITE(TestGroupnormBwdData, KernelTypes);
TYPED_TEST(TestGroupnormBwdData, Test_FP32) { this->Run(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_layernorm_bwd_data_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
template <typename Tuple>
class TestLayernorm2dBwdData : public ::testing::Test
{
protected:
using DYDataType = std::tuple_element_t<0, Tuple>;
using XDataType = std::tuple_element_t<1, Tuple>;
using GammaDataType = std::tuple_element_t<2, Tuple>;
using MeanInvStdDataType = std::tuple_element_t<3, Tuple>;
using ComputeDataType = std::tuple_element_t<4, Tuple>;
using DXDataType = std::tuple_element_t<5, Tuple>;
void Run()
{
// Bwd data: [N, D], reduce D
std::vector<std::vector<ck::index_t>> lengths = {
{4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
for(auto length : lengths)
{
bool success =
ck::profiler::profile_layernorm_bwd_data_impl<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
ComputeDataType,
DXDataType,
2>(true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = ::testing::Types<
// DYDataType, XDataType, GammaDataType, MeanInvStdDataType, ComputeDataType, DXDataType
std::tuple<F32, F32, F32, F32, F32, F32>>;
TYPED_TEST_SUITE(TestLayernorm2dBwdData, KernelTypes);
TYPED_TEST(TestLayernorm2dBwdData, Test_FP32) { this->Run(); }
@@ -47,8 +47,8 @@ class TestGroupnorm : public ::testing::Test
};
using KernelTypes = ::testing::Types<
- // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
- std::tuple<F16, F16, F16, F32, F16, F32>>;
+ // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType>
+ std::tuple<F16, F16, F16, F32, F16, F16>>;
TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
TYPED_TEST(TestGroupnorm, Test_FP16) { this->Run(); }
@@ -45,7 +45,7 @@ class TestGroupnorm : public ::testing::Test
};
using KernelTypes = ::testing::Types<
- // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
+ // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType>
std::tuple<F32, F32, F32, F32, F32, F32>>;
TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
......
@@ -41,8 +41,8 @@ class TestLayernorm2d : public ::testing::Test
};
using KernelTypes = ::testing::Types<
- // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
- std::tuple<F16, F16, F16, F32, F16, F32>>;
+ // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType>
+ std::tuple<F16, F16, F16, F32, F16, F16>>;
TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
TYPED_TEST(TestLayernorm2d, Test_FP16) { this->Run(); }
@@ -41,8 +41,8 @@ class TestLayernorm4d : public ::testing::Test
};
using KernelTypes = ::testing::Types<
- // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
- std::tuple<F16, F16, F16, F32, F16, F32>>;
+ // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType>
+ std::tuple<F16, F16, F16, F32, F16, F16>>;
TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes);
TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }
add_custom_target(test_permute)
add_gtest_executable(test_permute_scale test_permute_scale.cpp)
if(result EQUAL 0)
target_link_libraries(test_permute_scale PRIVATE utility device_permute_scale_instance)
add_dependencies(test_permute test_permute_scale)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_permute_scale_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
template <typename Tuple>
class TestPermute : public ::testing::Test
{
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
void Run()
{
std::vector<std::vector<ck::index_t>> lengths = {
{4, 2, 1, 8}, {1, 1, 1, 1}, {16, 8, 32, 64}, {32, 64, 128, 128}};
for(auto length : lengths)
{
bool success =
ck::test_permute_scale_impl<ADataType, BDataType, 4>(true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
TYPED_TEST_SUITE(TestPermute, KernelTypes);
TYPED_TEST(TestPermute, Test_FP16) { this->Run(); }
TYPED_TEST(TestPermute, Test_FP32) { this->Run(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
const HostTensorA& A_nchw,
FunctorA functor_a,
FunctorB functor_b,
float scale)
{
std::size_t N = A_nchw.mDesc.GetLengths()[0];
std::size_t C = A_nchw.mDesc.GetLengths()[1];
std::size_t H = A_nchw.mDesc.GetLengths()[2];
std::size_t W = A_nchw.mDesc.GetLengths()[3];
for(std::size_t w = 0; w < W; ++w)
for(std::size_t h = 0; h < H; ++h)
for(std::size_t c = 0; c < C; ++c)
for(std::size_t n = 0; n < N; ++n)
{
using tmp_type = ck::remove_reference_t<decltype(B_nhwc(0, 0))>;
tmp_type tmp_val = 0;
auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
functor_b(tmp_val, a_val);
functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
scale * tmp_val);
}
}
template <typename ADataType, typename BDataType, index_t NumDim>
bool test_permute_scale_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> lengths)
{
bool pass = true;
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
using Scale = ck::tensor_operation::element_wise::Scale;
float scale = 2.f;
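// The op under test computes B = scale * A^2 (UnarySquare then Scale) while
// permuting the NCHW input into an NHWC output.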
index_t N = lengths[0];
index_t C = lengths[1];
index_t H = lengths[2];
index_t W = lengths[3];
std::vector<ck::index_t> nchw = {N, C, H, W};
std::vector<ck::index_t> nhwc = {N, H, W, C};
Tensor<ADataType> a(nchw);
Tensor<BDataType> b(nhwc);
Tensor<BDataType> host_b(nhwc);
std::array<ck::index_t, 4> ab_lengths;
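// Strides below are written with dimension 0 fastest-varying (column-major
// over the listed extents); this matches the indexing in host_elementwise4D.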
std::array<ck::index_t, 4> a_strides = {1,
static_cast<int>(nchw[0]),
static_cast<int>(nchw[0] * nchw[1]),
static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
std::array<ck::index_t, 4> b_strides = {1,
static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
static_cast<int>(nhwc[0]),
static_cast<int>(nhwc[0] * nhwc[1])};
ck::ranges::copy(nchw, ab_lengths.begin());
std::cout << "A: " << a.mDesc << std::endl;
std::cout << "B: " << b.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: {
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
// Braces scope these declarations so the earlier case labels do not jump
// over their initialization.
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
auto i = 0;
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
(h * nchw[3]) + w] = i;
i = dis(gen);
}
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
ElementOp,
UnaryOp,
Scale,
NumDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
if(do_verification)
{
host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
}
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(ab_lengths,
{a_strides},
{b_strides},
input,
output,
ElementOp{},
UnaryOp{},
Scale{scale});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
b_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
}
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
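// ave_time is in ms, so (flop / 1e9) / ms gives TFlops and
// (bytes / 1e6) / ms gives GB/s.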
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_instance_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", lengths, ",") << ", ";
std::cout << "best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return pass;
}
} // namespace ck
add_gtest_executable(test_layout test_layout.cpp)
target_link_libraries(test_layout PRIVATE utility)
add_gtest_executable(test_tensor test_tensor.cpp)
target_link_libraries(test_tensor PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
class TestWrapperLayout : public ::testing::Test
{
protected:
static constexpr auto I0 = ck::Number<0>{};
static constexpr auto I1 = ck::Number<1>{};
template <typename Desc,
typename Desc1d,
typename LayoutRuntime,
typename LayoutCompiletime,
typename Idxs>
void Run(Desc& desc,
Desc1d& desc_1d,
LayoutRuntime& layout_runtime,
LayoutCompiletime& layout_compiletime,
const std::vector<Idxs>& idxs)
{
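// Compare the wrapper layout against a reference tensor descriptor: first
// element-by-element through the merged 1-d view, then for every
// multi-dimensional index in idxs. Runtime and compile-time layouts must agree.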
// 1d check
EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
// Check layout compiletime and runtime result consistency
EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
{
const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i));
const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
}
// size(layout)-d check, don't check if access is hierarchical
if constexpr(!IsNestedTuple(Idxs{}))
{
ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
ck::wrapper::size<d>(layout_compiletime));
});
}
for(const auto idx : idxs)
{
const ck::index_t layout_runtime_offset = layout_runtime(idx);
const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
const ck::index_t desc_offset =
desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
EXPECT_EQ(layout_runtime_offset, desc_offset);
EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
}
}
};
TEST_F(TestWrapperLayout, 2d)
{
// dims:(4, 3) strides:(1, 4)
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s1 = 1;
constexpr ck::index_t s0 = 4;
const auto desc =
ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
const auto layout_compiletime =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs.emplace_back(h, w);
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
}
TEST_F(TestWrapperLayout, 3d_nested)
{
// dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s3 = 2;
constexpr ck::index_t s2 = 4;
constexpr ck::index_t s1 = 12;
constexpr ck::index_t s0 = 48;
const auto desc = ck::make_naive_tensor_descriptor(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_3d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
ck::make_pass_through_transform(d1),
ck::make_pass_through_transform(d0)),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
const auto layout_runtime =
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
ck::Number<s1>{},
ck::Number<s0>{}));
std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;
for(ck::index_t d = 0; d < d2 * d3; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_3d.emplace_back(d, h, w);
}
}
}
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
// Check also 4d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
TEST_F(TestWrapperLayout, 2d_nested)
{
// dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s3 = 2;
constexpr ck::index_t s2 = 4;
constexpr ck::index_t s1 = 48;
constexpr ck::index_t s0 = 12;
const auto desc = ck::make_naive_tensor_descriptor(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_2d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
const auto layout_runtime =
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
for(ck::index_t h = 0; h < d2 * d3; h++)
{
for(ck::index_t w = 0; w < d0 * d1; w++)
{
idxs_2d.emplace_back(h, w);
}
}
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
// Check also 4d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
idxs_4d;
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w));
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
TEST_F(TestWrapperLayout, 3d_double_nested)
{
// dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s4 = 2;
constexpr ck::index_t s3 = 4;
constexpr ck::index_t s2 = 8;
constexpr ck::index_t s1 = 96;
constexpr ck::index_t s0 = 24;
const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
ck::Number<d3>{},
ck::Number<d2>{},
ck::Number<d1>{},
ck::Number<d0>{}),
ck::make_tuple(ck::Number<s4>{},
ck::Number<s3>{},
ck::Number<s2>{},
ck::Number<s1>{},
ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_3d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
ck::make_pass_through_transform(d2),
ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
const auto desc_2d = transform_tensor_descriptor(
desc_3d,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
ck::make_pass_through_transform(d1 * d0)),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0)));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
{
for(ck::index_t w = 0; w < d0 * d1; w++)
{
idxs_2d.emplace_back(h, w);
}
}
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
// Check also 3d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;
for(ck::index_t d = 0; d < d3 * d4; d++)
{
for(ck::index_t h = 0; h < d2; h++)
{
for(ck::index_t w = 0; w < d1 * d0; w++)
{
idxs_3d.emplace_back(ck::make_tuple(d, h), w);
}
}
}
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
// Check also 5d iteration
std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
ck::Tuple<ck::index_t, ck::index_t>>>
idxs_5d;
for(ck::index_t f = 0; f < d4; f++)
{
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
ck::make_tuple(h, w));
}
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
}
TEST(TestLayoutHelpers, SizeAndGet)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
// Size of layout
EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);
// Size of dims
EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);
// Access through new layout (using get with layout object)
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
d4);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
d3);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
}
TEST(TestLayoutHelpers, DepthAndRank)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
// Check for integer
EXPECT_EQ(ck::wrapper::depth(d0), 0);
EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);
EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
// Check for integer
EXPECT_EQ(ck::wrapper::rank(d0), 1);
}
TEST(TestLayoutHelpers, ShapeAndStrides)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s4 = 2;
constexpr ck::index_t s3 = 4;
constexpr ck::index_t s2 = 8;
constexpr ck::index_t s1 = 96;
constexpr ck::index_t s0 = 24;
const auto shape_compiletime = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
const auto strides_compiletime = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
const auto shape_runtime =
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
const auto strides_runtime =
ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);
const auto layout_compiletime =
ck::wrapper::make_layout(shape_compiletime, strides_compiletime);
constexpr bool check_compiletime_shape =
std::is_same_v<decltype(shape_compiletime),
std::remove_reference_t<decltype(shape(layout_compiletime))>>;
constexpr bool check_compiletime_strides =
std::is_same_v<decltype(strides_compiletime),
std::remove_reference_t<decltype(stride(layout_compiletime))>>;
constexpr bool check_runtime_shape =
std::is_same_v<decltype(shape_runtime),
std::remove_reference_t<decltype(shape(layout_runtime))>>;
constexpr bool check_runtime_strides =
std::is_same_v<decltype(strides_runtime),
std::remove_reference_t<decltype(stride(layout_runtime))>>;
EXPECT_TRUE(check_compiletime_shape);
EXPECT_TRUE(check_compiletime_strides);
EXPECT_TRUE(check_runtime_shape);
EXPECT_TRUE(check_runtime_strides);
}
TEST(TestLayoutHelpers, Hierarchical)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto runtime_shape =
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
const auto layout_runtime = ck::wrapper::make_layout(runtime_shape);
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);
EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);
EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);
EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/library/utility/device_memory.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
// Compare data in tensor with offset from layout.
// Data and offset should match if physical memory has been initialized with
// sequentially increasing values from 0.
template <typename TensorType>
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
{
const auto& layout = ck::wrapper::layout(tensor);
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
{
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
{
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
{
const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
if(tensor(idx) != layout(idx))
{
return false;
}
}
}
}
return true;
}
template <typename TensorType>
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
{
const auto& layout = ck::wrapper::layout(tensor);
for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
{
if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
{
return false;
}
}
return true;
}
template <ck::index_t nelems, typename TensorType>
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
{
const auto& layout = ck::wrapper::layout(tensor);
bool success = true;
ck::static_for<0, nelems, 1>{}([&](auto w) {
if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
{
success = false;
}
});
return success;
}
template <typename TensorType>
__host__ __device__ void InitTensor(TensorType& tensor)
{
for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
{
tensor(i) = i;
}
}
template <ck::index_t nelems, typename TensorType>
__host__ __device__ void StaticInitTensor(TensorType& tensor)
{
ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
}
// Tests
TEST(TestTensor, ReadWriteHostMemory)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> data;
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
InitTensor(tensor);
EXPECT_TRUE(TestTensorCheck1d(tensor));
EXPECT_TRUE(TestTensorCheck3d(tensor));
}
__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
constexpr ck::index_t nelems = 8;
constexpr ck::index_t scalar_per_vector = 1;
__shared__ ck::index_t p_shared[nelems];
ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
bool* casted_success_ptr = static_cast<bool*>(success);
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}));
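// Note: register_layout is fully compile-time (extent wrapped in ck::Number),
// which the VGPR/SGPR register tensors created below rely on.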
auto tensor_global =
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
nelems,
scalar_per_vector,
ck::index_t>(register_layout);
auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
nelems,
scalar_per_vector,
ck::index_t>(register_layout);
InitTensor(tensor_global);
InitTensor(tensor_lds);
StaticInitTensor<nelems>(tensor_vgpr);
StaticInitTensor<nelems>(tensor_sgpr);
*casted_success_ptr = true; // initialize before accumulating results
*casted_success_ptr &= TestTensorCheck1d(tensor_global);
*casted_success_ptr &= TestTensorCheck3d(tensor_global);
*casted_success_ptr &= TestTensorCheck1d(tensor_lds);
*casted_success_ptr &= TestTensorCheck3d(tensor_lds);
*casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
*casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);
}
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> host_data;
DeviceMem data_buf(nelems * sizeof(ck::index_t));
data_buf.ToDevice(&host_data[0]);
DeviceMem success_buf(sizeof(bool));
launch_and_time_kernel(StreamConfig{},
TestTensorReadWriteDevice,
dim3(1),
dim3(1),
nelems * sizeof(ck::index_t),
data_buf.GetDeviceBuffer(),
success_buf.GetDeviceBuffer());
bool success;
success_buf.FromDevice(&success);
EXPECT_TRUE(success);
}
TEST(TestTensor, Slicing)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> data;
const auto shape = ck::make_tuple(ck::make_tuple(2, 2), 2);
const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4);
const auto layout = ck::wrapper::make_layout(shape, strides);
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
InitTensor(tensor);
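// Slicing semantics exercised below: slice(n) keeps n elements of a dimension,
// slice(b, e) keeps the range [b, e) (a negative bound counts from the end of
// the dimension), slice() keeps the whole dimension, and a plain integer fixes it.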
auto tensor2x2x2 =
tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));
auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
EXPECT_EQ(rank(tensor1x1), 2);
EXPECT_EQ(depth(tensor1x1), 2);
EXPECT_EQ(size(tensor1x1), 1);
EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1))));
auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
EXPECT_EQ(ck::wrapper::size(tensor2), 2);
EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0))));
// negative indexing
auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
EXPECT_EQ(rank(tensor1x2), 2);
EXPECT_EQ(depth(tensor1x2), 2);
EXPECT_EQ(size(tensor1x2), 2);
EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
}