Commit 057140b1 authored by Adam Osewski's avatar Adam Osewski
Browse files

Merge remote-tracking branch 'origin/develop' into aosewski/ggemm_multi_d2

parents 134fc2e7 12a8883c
......@@ -41,8 +41,8 @@ class TestLayernorm2d : public ::testing::Test
};
using KernelTypes = ::testing::Types<
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
std::tuple<F16, F16, F16, F32, F16, F32>>;
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType>
std::tuple<F16, F16, F16, F32, F16, F16>>;
TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
TYPED_TEST(TestLayernorm2d, Test_FP16) { this->Run(); }
......@@ -41,8 +41,8 @@ class TestLayernorm4d : public ::testing::Test
};
using KernelTypes = ::testing::Types<
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
std::tuple<F16, F16, F16, F32, F16, F32>>;
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType>
std::tuple<F16, F16, F16, F32, F16, F16>>;
TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes);
TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }
add_custom_target(test_permute)
add_gtest_executable(test_permute_scale test_permute_scale.cpp)
if(result EQUAL 0)
target_link_libraries(test_permute_scale PRIVATE utility device_permute_scale_instance)
add_dependencies(test_permute test_permute_scale)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_permute_scale_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
template <typename Tuple>
class TestPermute : public ::testing::Test
{
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
void Run()
{
std::vector<std::vector<ck::index_t>> lengths = {
{4, 2, 1, 8}, {1, 1, 1, 1}, {16, 8, 32, 64}, {32, 64, 128, 128}};
for(auto length : lengths)
{
bool success =
ck::test_permute_scale_impl<ADataType, BDataType, 4>(true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
TYPED_TEST_SUITE(TestPermute, KernelTypes);
TYPED_TEST(TestPermute, Test_FP16) { this->Run(); }
TYPED_TEST(TestPermute, Test_FP32) { this->Run(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
const HostTensorA& A_nchw,
FunctorA functor_a,
FunctorB functor_b,
float scale)
{
std::size_t N = A_nchw.mDesc.GetLengths()[0];
std::size_t C = A_nchw.mDesc.GetLengths()[1];
std::size_t H = A_nchw.mDesc.GetLengths()[2];
std::size_t W = A_nchw.mDesc.GetLengths()[3];
for(std::size_t w = 0; w < W; ++w)
for(std::size_t h = 0; h < H; ++h)
for(std::size_t c = 0; c < C; ++c)
for(std::size_t n = 0; n < N; ++n)
{
using tmp_type = ck::remove_reference_t<decltype(B_nhwc(0, 0))>;
tmp_type tmp_val = 0;
auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
functor_b(tmp_val, a_val);
functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
scale * tmp_val);
}
}
template <typename ADataType, typename BDataType, index_t NumDim>
bool test_permute_scale_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> lengths)
{
bool pass = true;
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
using Scale = ck::tensor_operation::element_wise::Scale;
float scale = 2.f;
index_t N = lengths[0];
index_t C = lengths[1];
index_t H = lengths[2];
index_t W = lengths[3];
std::vector<ck::index_t> nchw = {N, C, H, W};
std::vector<ck::index_t> nhwc = {N, H, W, C};
Tensor<ADataType> a(nchw);
Tensor<BDataType> b(nhwc);
Tensor<BDataType> host_b(nhwc);
std::array<ck::index_t, 4> ab_lengths;
std::array<ck::index_t, 4> a_strides = {1,
static_cast<int>(nchw[0]),
static_cast<int>(nchw[0] * nchw[1]),
static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
std::array<ck::index_t, 4> b_strides = {1,
static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
static_cast<int>(nhwc[0]),
static_cast<int>(nhwc[0] * nhwc[1])};
ck::ranges::copy(nchw, ab_lengths.begin());
std::cout << "A: " << a.mDesc << std::endl;
std::cout << "B: " << b.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
auto i = 0;
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
(h * nchw[3]) + w] = i;
i = dis(gen);
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
ElementOp,
UnaryOp,
Scale,
NumDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
if(do_verification)
{
host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
}
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(ab_lengths,
{a_strides},
{b_strides},
input,
output,
ElementOp{},
UnaryOp{},
Scale{scale});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
b_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
}
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_instance_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", lengths, ",") << ", ";
std::cout << "best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return true;
}
} // namespace ck
......@@ -14,7 +14,6 @@ TYPED_TEST(TestTranspose, Test1)
this->Run();
}
TYPED_TEST(TestTranpose, Test2)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
......@@ -27,4 +26,3 @@ TYPED_TEST(TestTranpose, Test2)
this->Run();
}
add_gtest_executable(test_layout test_layout.cpp)
target_link_libraries(test_layout PRIVATE utility)
add_gtest_executable(test_tensor test_tensor.cpp)
target_link_libraries(test_tensor PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
class TestWrapperLayout : public ::testing::Test
{
protected:
static constexpr auto I0 = ck::Number<0>{};
static constexpr auto I1 = ck::Number<1>{};
template <typename Desc,
typename Desc1d,
typename LayoutRuntime,
typename LayoutCompiletime,
typename Idxs>
void Run(Desc& desc,
Desc1d& desc_1d,
LayoutRuntime& layout_runtime,
LayoutCompiletime& layout_compiletime,
const std::vector<Idxs>& idxs)
{
// 1d check
EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
// Check layout compiletime and runtime result consistency
EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
{
const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i));
const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
}
// size(layout)-d check, don't check if access is hierarchical
if constexpr(!IsNestedTuple(Idxs{}))
{
ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
ck::wrapper::size<d>(layout_compiletime));
});
}
for(const auto idx : idxs)
{
const ck::index_t layout_runtime_offset = layout_runtime(idx);
const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
const ck::index_t desc_offset =
desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
EXPECT_EQ(layout_runtime_offset, desc_offset);
EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
}
}
};
TEST_F(TestWrapperLayout, 2d)
{
// dims:(4, 3) strides:(1, 4)
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s1 = 1;
constexpr ck::index_t s0 = 4;
const auto desc =
ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
const auto layout_compiletime =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs.emplace_back(h, w);
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
}
TEST_F(TestWrapperLayout, 3d_nested)
{
// dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s3 = 2;
constexpr ck::index_t s2 = 4;
constexpr ck::index_t s1 = 12;
constexpr ck::index_t s0 = 48;
const auto desc = ck::make_naive_tensor_descriptor(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_3d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
ck::make_pass_through_transform(d1),
ck::make_pass_through_transform(d2)),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
const auto layout_runtime =
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
ck::Number<s1>{},
ck::Number<s0>{}));
std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;
for(ck::index_t d = 0; d < d2 * d3; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_3d.emplace_back(d, h, w);
}
}
}
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
// Check also 4d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
TEST_F(TestWrapperLayout, 2d_nested)
{
// dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s3 = 2;
constexpr ck::index_t s2 = 4;
constexpr ck::index_t s1 = 48;
constexpr ck::index_t s0 = 12;
const auto desc = ck::make_naive_tensor_descriptor(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_2d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
const auto layout_runtime =
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
for(ck::index_t h = 0; h < d2 * d3; h++)
{
for(ck::index_t w = 0; w < d0 * d1; w++)
{
idxs_2d.emplace_back(h, w);
}
}
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
// Check also 4d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
idxs_4d;
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w));
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
TEST_F(TestWrapperLayout, 3d_double_nested)
{
// dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s4 = 2;
constexpr ck::index_t s3 = 4;
constexpr ck::index_t s2 = 8;
constexpr ck::index_t s1 = 96;
constexpr ck::index_t s0 = 24;
const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
ck::Number<d3>{},
ck::Number<d2>{},
ck::Number<d1>{},
ck::Number<d0>{}),
ck::make_tuple(ck::Number<s4>{},
ck::Number<s3>{},
ck::Number<s2>{},
ck::Number<s1>{},
ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_3d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
ck::make_pass_through_transform(d2),
ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
const auto desc_2d = transform_tensor_descriptor(
desc_3d,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
ck::make_pass_through_transform(d1 * d0)),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, s3), s2), ck::make_tuple(s1, s0)));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
{
for(ck::index_t w = 0; w < d0 * d1; w++)
{
idxs_2d.emplace_back(h, w);
}
}
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
// Check also 3d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;
for(ck::index_t d = 0; d < d3 * d4; d++)
{
for(ck::index_t h = 0; h < d2; h++)
{
for(ck::index_t w = 0; w < d1 * d0; w++)
{
idxs_3d.emplace_back(ck::make_tuple(d, h), w);
}
}
}
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
// Check also 5d iteration
std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
ck::Tuple<ck::index_t, ck::index_t>>>
idxs_5d;
for(ck::index_t f = 0; f < d4; f++)
{
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
ck::make_tuple(h, w));
}
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
}
TEST(TestLayoutHelpers, SizeAndGet)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
// Size of layout
EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);
// Size of dims
EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);
// Access through new layout (using get with layout object)
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
d4);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
}
TEST(TestLayoutHelpers, DepthAndRank)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
// Check for integer
EXPECT_EQ(ck::wrapper::depth(d0), 0);
EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);
EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
// Check for integer
EXPECT_EQ(ck::wrapper::rank(d0), 1);
}
TEST(TestLayoutHelpers, ShapeAndStrides)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s4 = 2;
constexpr ck::index_t s3 = 4;
constexpr ck::index_t s2 = 8;
constexpr ck::index_t s1 = 96;
constexpr ck::index_t s0 = 24;
const auto shape_compiletime = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
const auto strides_compiletime = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
const auto shape_runtime =
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
const auto strides_runtime =
ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);
const auto layout_compiletime =
ck::wrapper::make_layout(shape_compiletime, strides_compiletime);
constexpr bool check_compiletime_shape =
std::is_same_v<decltype(shape_compiletime),
std::remove_reference_t<decltype(shape(layout_compiletime))>>;
constexpr bool check_compiletime_strides =
std::is_same_v<decltype(strides_compiletime),
std::remove_reference_t<decltype(stride(layout_compiletime))>>;
constexpr bool check_runtime_shape =
std::is_same_v<decltype(shape_runtime),
std::remove_reference_t<decltype(shape(layout_runtime))>>;
constexpr bool check_runtime_strides =
std::is_same_v<decltype(strides_runtime),
std::remove_reference_t<decltype(stride(layout_runtime))>>;
EXPECT_TRUE(check_compiletime_shape);
EXPECT_TRUE(check_compiletime_strides);
EXPECT_TRUE(check_runtime_shape);
EXPECT_TRUE(check_runtime_strides);
}
TEST(TestLayoutHelpers, Hierarchical)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto runtime_shape =
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
const auto layout_runtime = ck::wrapper::make_layout(runtime_shape);
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);
EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);
EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);
EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/library/utility/device_memory.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
// Compare data in tensor with offset from layout.
// Data and offset should match if physical memory has been initialized with
// sequentially increasing values from 0.
template <typename TensorType>
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
{
const auto& layout = ck::wrapper::layout(tensor);
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
{
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
{
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
{
const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
if(tensor(idx) != layout(idx))
{
return false;
}
}
}
}
return true;
}
template <typename TensorType>
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
{
const auto& layout = ck::wrapper::layout(tensor);
for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
{
if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
{
return false;
}
}
return true;
}
template <ck::index_t nelems, typename TensorType>
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
{
const auto& layout = ck::wrapper::layout(tensor);
bool success = true;
ck::static_for<0, nelems, 1>{}([&](auto w) {
if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
{
success = false;
}
});
return success;
}
template <typename TensorType>
__host__ __device__ void InitTensor(TensorType& tensor)
{
for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
{
tensor(i) = i;
}
}
template <ck::index_t nelems, typename TensorType>
__host__ __device__ void StaticInitTensor(TensorType& tensor)
{
ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
}
// Tests
TEST(TestTensor, ReadWriteHostMemory)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> data;
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
InitTensor(tensor);
EXPECT_TRUE(TestTensorCheck1d(tensor));
EXPECT_TRUE(TestTensorCheck3d(tensor));
}
__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
constexpr ck::index_t nelems = 8;
constexpr ck::index_t scalar_per_vector = 1;
__shared__ ck::index_t p_shared[nelems];
ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
bool* casted_success_ptr = static_cast<bool*>(success);
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}));
auto tensor_global =
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
nelems,
scalar_per_vector,
ck::index_t>(register_layout);
auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
nelems,
scalar_per_vector,
ck::index_t>(register_layout);
InitTensor(tensor_global);
InitTensor(tensor_lds);
StaticInitTensor<nelems>(tensor_vgpr);
StaticInitTensor<nelems>(tensor_sgpr);
*casted_success_ptr &= TestTensorCheck1d(tensor_global);
*casted_success_ptr &= TestTensorCheck3d(tensor_global);
*casted_success_ptr &= TestTensorCheck1d(tensor_lds);
*casted_success_ptr &= TestTensorCheck3d(tensor_lds);
*casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
*casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);
}
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> host_data;
DeviceMem data_buf(nelems * sizeof(ck::index_t));
data_buf.ToDevice(&host_data[0]);
DeviceMem success_buf(sizeof(bool));
launch_and_time_kernel(StreamConfig{},
TestTensorReadWriteDevice,
dim3(1),
dim3(1),
nelems * sizeof(ck::index_t),
data_buf.GetDeviceBuffer(),
success_buf.GetDeviceBuffer());
bool success;
success_buf.FromDevice(&success);
EXPECT_TRUE(success);
}
TEST(TestTensor, Slicing)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> data;
const auto shape = ck::make_tuple(ck::make_tuple(2, 2), 2);
const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4);
const auto layout = ck::wrapper::make_layout(shape, strides);
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
InitTensor(tensor);
auto tensor2x2x2 =
tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));
auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
EXPECT_EQ(rank(tensor1x1), 2);
EXPECT_EQ(depth(tensor1x1), 2);
EXPECT_EQ(size(tensor1x1), 1);
EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1))));
auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
EXPECT_EQ(ck::wrapper::size(tensor2), 2);
EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0))));
// negative indexing
auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
EXPECT_EQ(rank(tensor1x2), 2);
EXPECT_EQ(depth(tensor1x2), 2);
EXPECT_EQ(size(tensor1x2), 2);
EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment