Unverified Commit 29dcb956 authored by Illia Silin, committed by GitHub

Merge pull request #33 from ROCm/lwpck-1292

Merge from the public repo.
parents 29deceb6 cbcc844e
@@ -41,8 +41,8 @@ class TestLayernorm4d : public ::testing::Test
 };
 using KernelTypes = ::testing::Types<
-// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
-std::tuple<F16, F16, F16, F32, F16, F32>>;
+// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType>
+std::tuple<F16, F16, F16, F32, F16, F16>>;
 TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes);
 TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }
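For readers scanning the hunk: the only functional change is in the final tuple element, SaveMeanInvStdDataType, which moves from F32 to F16. A restatement of the type-list convention, with element names taken from the updated comment (and assuming the file's usual F16/F32 aliases):

// Element order of the TestLayernorm4d kernel tuple; the hunk above
// changes only the last entry.
using KernelTuple = std::tuple<F16,  // XDataType
                               F16,  // GammaDataType
                               F16,  // BetaDataType
                               F32,  // ComputeDataType
                               F16,  // YDataType
                               F16>; // SaveMeanInvStdDataType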
add_custom_target(test_permute)
add_gtest_executable(test_permute_scale test_permute_scale.cpp)
if(result EQUAL 0)
target_link_libraries(test_permute_scale PRIVATE utility device_permute_scale_instance)
add_dependencies(test_permute test_permute_scale)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_permute_scale_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
template <typename Tuple>
class TestPermute : public ::testing::Test
{
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
void Run()
{
std::vector<std::vector<ck::index_t>> lengths = {
{4, 2, 1, 8}, {1, 1, 1, 1}, {16, 8, 32, 64}, {32, 64, 128, 128}};
for(auto length : lengths)
{
bool success =
ck::test_permute_scale_impl<ADataType, BDataType, 4>(true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
TYPED_TEST_SUITE(TestPermute, KernelTypes);
TYPED_TEST(TestPermute, Test_FP16) { this->Run(); }
TYPED_TEST(TestPermute, Test_FP32) { this->Run(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
const HostTensorA& A_nchw,
FunctorA functor_a,
FunctorB functor_b,
float scale)
{
std::size_t N = A_nchw.mDesc.GetLengths()[0];
std::size_t C = A_nchw.mDesc.GetLengths()[1];
std::size_t H = A_nchw.mDesc.GetLengths()[2];
std::size_t W = A_nchw.mDesc.GetLengths()[3];
for(std::size_t w = 0; w < W; ++w)
for(std::size_t h = 0; h < H; ++h)
for(std::size_t c = 0; c < C; ++c)
for(std::size_t n = 0; n < N; ++n)
{
using tmp_type = ck::remove_reference_t<decltype(B_nhwc(0, 0))>;
tmp_type tmp_val = 0;
auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
functor_b(tmp_val, a_val);
functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
scale * tmp_val);
}
}
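To make the index arithmetic above concrete, here is a minimal standalone sketch (plain C++, no CK dependencies) of the same NCHW-to-NHWC permute-plus-scale reference on a tiny tensor; the squaring step mirrors what UnarySquare does in the test driver below, and all names are illustrative:

#include <cstddef>
#include <iostream>
#include <vector>
// Standalone restatement of host_elementwise4D's index math: A is stored
// with N fastest (offset = n + c*N + h*C*N + w*H*C*N) and B with the NHWC
// permutation of the same first-dimension-fastest convention.
int main()
{
    const std::size_t N = 2, C = 3, H = 2, W = 2;
    const float scale = 2.f;
    std::vector<float> A(N * C * H * W), B(N * C * H * W);
    for(std::size_t i = 0; i < A.size(); ++i)
        A[i] = static_cast<float>(i);
    for(std::size_t w = 0; w < W; ++w)
        for(std::size_t h = 0; h < H; ++h)
            for(std::size_t c = 0; c < C; ++c)
                for(std::size_t n = 0; n < N; ++n)
                {
                    const float a = A[n + c * N + h * C * N + w * H * C * N];
                    // UnarySquare, then Scale (PassThrough on the store side)
                    B[n + c * W * H * N + h * N + w * H * N] = scale * a * a;
                }
    std::cout << "B[1] = " << B[1] << std::endl; // A[1] squared, then doubled
    return 0;
}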
template <typename ADataType, typename BDataType, index_t NumDim>
bool test_permute_scale_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> lengths)
{
bool pass = true;
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
using Scale = ck::tensor_operation::element_wise::Scale;
float scale = 2.f;
index_t N = lengths[0];
index_t C = lengths[1];
index_t H = lengths[2];
index_t W = lengths[3];
std::vector<ck::index_t> nchw = {N, C, H, W};
std::vector<ck::index_t> nhwc = {N, H, W, C};
Tensor<ADataType> a(nchw);
Tensor<BDataType> b(nhwc);
Tensor<BDataType> host_b(nhwc);
std::array<ck::index_t, 4> ab_lengths;
std::array<ck::index_t, 4> a_strides = {1,
static_cast<int>(nchw[0]),
static_cast<int>(nchw[0] * nchw[1]),
static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
std::array<ck::index_t, 4> b_strides = {1,
static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
static_cast<int>(nhwc[0]),
static_cast<int>(nhwc[0] * nhwc[1])};
ck::ranges::copy(nchw, ab_lengths.begin());
std::cout << "A: " << a.mDesc << std::endl;
std::cout << "B: " << b.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
auto i = 0;
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
(h * nchw[3]) + w] = i;
i = dis(gen);
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
ElementOp,
UnaryOp,
Scale,
NumDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
if(do_verification)
{
host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
}
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(ab_lengths,
{a_strides},
{b_strides},
input,
output,
ElementOp{},
UnaryOp{},
Scale{scale});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
b_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
}
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_instance_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", lengths, ",") << ", ";
std::cout << "best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return pass;
}
} // namespace ck
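A minimal sketch of how the helper above is invoked, mirroring the call in test_permute_scale.cpp (argument order: do_verification, init_method, do_log, time_kernel, then the NCHW lengths):

// Hedged usage sketch; the lengths here are one of the sets used by the test.
bool ok = ck::test_permute_scale_impl<ck::half_t, ck::half_t, 4>(
    true, 2, false, false, {16, 8, 32, 64});
EXPECT_TRUE(ok);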
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "profiler/profile_transpose_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
template <typename Tuple>
class TestTranspose : public ::testing::Test
{
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
void Run()
{
std::vector<std::vector<ck::index_t>> lengths = {
{4, 16, 16, 32, 5}, {8, 16, 16, 32, 8} /**{32, 16, 16, 32, 8},**/};
for(auto length : lengths)
{
bool success = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
TYPED_TEST_SUITE(TestTranspose, KernelTypes);
TYPED_TEST(TestTranspose, Test_FP16) { this->Run(); }
TYPED_TEST(TestTranspose, Test_FP32) { this->Run(); }
#pragma once
TYPED_TEST(TestTranspose, Test1)
{
// for 16, 8, 16, 32, 8
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
std::vector<index_t> lengths{16, 8, 16, 32, 8};
/**constexpr int N = 16;
constexpr int C = 8;
constexpr int D = 16;
constexpr int H = 32;
constexpr int W = 8;**/
this->Run();
}
TYPED_TEST(TestTranspose, Test2)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
std::vector<index_t> lengths{16, 8, 16, 32, 16};
/**constexpr int N = 16;
constexpr int C = 8;
constexpr int D = 16;
constexpr int H = 32;
constexpr int W = 8;**/
this->Run();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <sstream>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_transpose_impl.hpp"
namespace ck {
namespace test {
template <typename Tuple>
class TestTranspose : public testing::Test
{
using F32 = float;
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
public:
static constexpr bool verify_ = true;
static constexpr int init_method_ = 1; // decimal value initialization
static constexpr bool log_ = false;
static constexpr bool bench_ = false; // measure kernel performance
std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};
void Run()
{
for(auto length : this->lengths_)
{
this->RunSingle(length);
}
}
void RunSingle(const std::vector<index_t>& length)
{
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
verify_, init_method_, log_, bench_, length);
EXPECT_TRUE(pass);
}
};
} // namespace test
} // namespace ck
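A hypothetical extension point for the fixture above, assuming the usual gtest subclassing pattern; TestTransposeSmall and its length set are illustrative and not part of this commit:

namespace ck {
namespace test {
// Hypothetical derived fixture that narrows the default problem sizes;
// lengths_ is a public member of TestTranspose, so it can be overwritten.
template <typename Tuple>
class TestTransposeSmall : public TestTranspose<Tuple>
{
    public:
    TestTransposeSmall() { this->lengths_ = {{4, 16, 16, 32, 5}}; }
};
} // namespace test
} // namespace ck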
add_gtest_executable(test_layout test_layout.cpp)
target_link_libraries(test_layout PRIVATE utility)
add_gtest_executable(test_tensor test_tensor.cpp)
target_link_libraries(test_tensor PRIVATE utility)
add_gtest_executable(test_copy test_copy.cpp)
target_link_libraries(test_copy PRIVATE utility)
add_gtest_executable(test_partition test_partition.cpp)
target_link_libraries(test_partition PRIVATE utility)
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
GPU_TARGETS MATCHES "gfx942")
add_gtest_executable(test_gemm test_gemm.cpp)
target_link_libraries(test_gemm PRIVATE utility)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
// Test copy from Global to Global through LDS and VGPR
template <typename InputTensor,
typename OutputTensor,
typename BlockShape,
typename ThreadLayoutShape,
bool UseOptimizedCopy>
__global__ void TestCopyDevice(const InputTensor input_tensor,
OutputTensor output_tensor,
const BlockShape tile_shape,
const ThreadLayoutShape thread_layout)
{
__shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
p_shared, ck::wrapper::make_layout(tile_shape));
const auto block_idx = static_cast<ck::index_t>(blockIdx.x);
// Get local tiles for global memory
const auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
const auto output_local_tile =
ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);
// Get partition per thread
const auto input_local_partition =
ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
auto lds_local_partition =
ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
auto output_local_partition =
ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
// Allocate VGPR
auto tensor_vgpr =
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
layout(lds_local_partition));
// Perform copy
if constexpr(UseOptimizedCopy)
{
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>>;
constexpr ck::index_t vector_dim = 0;
constexpr ck::index_t scalar_per_vector = 2;
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
lds_local_partition);
// TODO: Enable optimized copy for static buffers
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
tensor_vgpr);
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
output_local_partition);
}
else
{
ck::wrapper::copy(input_local_partition, lds_local_partition);
ck::wrapper::copy(lds_local_partition, tensor_vgpr);
ck::wrapper::copy(tensor_vgpr, output_local_partition);
}
}
template <bool UseOptimizedCopy>
void PerformCopyGlobalToGlobalViaLDS()
{
const auto shape =
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{});
const auto strides =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
const auto layout = ck::wrapper::make_layout(shape, strides);
// 0, 1, 2, ..., size(shape) - 1
std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
std::iota(input_data.begin(), input_data.end(), 0);
// Global memory buffers
DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
in_buf.ToDevice(input_data.data());
out_buf.SetZero();
// Create tensors for global memory
const auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const ck::index_t*>(in_buf.GetDeviceBuffer()), layout);
auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);
const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{});
const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});
const ck::index_t grid_size = ck::math::integer_divide_ceil(
ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
const auto kernel = TestCopyDevice<decltype(input_tensor_global),
decltype(output_tensor_global),
decltype(tile_shape),
decltype(thread_layout),
UseOptimizedCopy>;
launch_and_time_kernel(StreamConfig{},
kernel,
dim3(grid_size),
dim3(ck::wrapper::size(thread_layout)),
0,
input_tensor_global,
output_tensor_global,
tile_shape,
thread_layout);
// Verify results
std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
out_buf.FromDevice(output_data.data());
EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
}
TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS<false>(); }
TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS<true>(); }
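A sanity check on the launch configuration above: the layout holds (2 * 2) * 256 = 1024 elements and each block copies a 4 * 64 = 256-element tile, so the kernel is launched with four blocks. Restated with plain integers:

// Grid-size arithmetic from PerformCopyGlobalToGlobalViaLDS, with the
// values of size(layout) and size(tile_shape) written out.
constexpr int tensor_size = (2 * 2) * 256; // size(layout) = 1024
constexpr int tile_size   = 4 * 64;        // size(tile_shape) = 256
constexpr int grid_size   = (tensor_size + tile_size - 1) / tile_size;
static_assert(grid_size == 4, "one block per 256-element tile");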
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/library/utility/host_tensor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
template <typename DataType>
void CheckResult(const std::vector<DataType>& a_data,
const std::vector<DataType>& b_data,
std::vector<DataType>& c_m_n_device_result,
const ck::index_t M,
const ck::index_t N,
const ck::index_t K)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<DataType, DataType, DataType, float, PassThrough, PassThrough, PassThrough>;
Tensor<DataType> a_m_k(HostTensorDescriptor({M, K}));
Tensor<DataType> b_k_n(HostTensorDescriptor({K, N}, {1, K}));
Tensor<DataType> c_m_n_host_result(HostTensorDescriptor({M, N}));
a_m_k.mData = a_data;
b_k_n.mData = b_data;
auto ref_op = ReferenceGemmInstance{};
auto ref_invoker = ref_op.MakeInvoker();
auto ref_argument = ref_op.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
EXPECT_TRUE(ck::utils::check_err(c_m_n_device_result, c_m_n_host_result.mData));
}
template <typename DataType,
typename GemmTraits,
ck::index_t scalar_per_vector,
typename BlockShape,
typename ThreadLayoutShape>
__global__ void DeviceGemm(const void* p_a,
const void* p_b,
void* p_c,
const ck::index_t M,
const ck::index_t N,
const ck::index_t K,
const BlockShape tile_shape,
const ThreadLayoutShape thread_layout)
{
constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);
const auto a_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
const auto b_global_layout =
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
const auto c_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
constexpr auto a_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto b_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto c_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_a), a_global_layout);
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_b), b_global_layout);
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<DataType*>(p_c), c_global_layout);
auto a_padded_global_tensor = ck::wrapper::pad(a_global_tensor, shape(a_tile_layout));
auto b_padded_global_tensor = ck::wrapper::pad(b_global_tensor, shape(b_tile_layout));
auto c_padded_global_tensor = ck::wrapper::pad(c_global_tensor, shape(c_tile_layout));
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_a), a_tile_layout);
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_b), b_tile_layout);
const ck::index_t block_idx = static_cast<ck::index_t>(blockIdx.x);
using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>;
constexpr ck::index_t vector_dim = 1;
auto c_global_local_tile = ck::wrapper::make_local_tile(
c_padded_global_tensor,
tile_shape,
block_idx,
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
auto c_global_local_partition =
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>(c_global_local_tile);
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>();
ck::wrapper::clear(c_vgpr_reg);
const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
ck::index_t i = 0;
do
{
const auto k_slice = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
auto a_padded_global_tensor_k_slice = a_padded_global_tensor(ck::wrapper::slice(), k_slice);
auto b_padded_global_tensor_k_slice = b_padded_global_tensor(ck::wrapper::slice(), k_slice);
auto a_global_local_tile = ck::wrapper::make_local_tile(
a_padded_global_tensor_k_slice,
tile_shape,
block_idx,
make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
auto b_global_local_tile = ck::wrapper::make_local_tile(
b_padded_global_tensor_k_slice,
tile_shape,
block_idx,
make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
a_global_local_tile, a_lds_tensor, thread_layout);
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
b_global_local_tile, b_lds_tensor, thread_layout);
ck::block_sync_lds();
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
++i;
} while(i < num_loop);
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}
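The launch arithmetic implied by the kernel above, restated for the regular 512 x 512 x 128 cases used in the tests below (one block per 128 x 128 tile of C, two 64-wide K iterations of the do/while loop):

constexpr int M = 512, N = 512, K = 128;
constexpr int MPerBlock = 128, NPerBlock = 128, KPerBlock = 64;
constexpr int grid_size = (M / MPerBlock) * (N / NPerBlock); // 4 * 4 = 16 blocks
constexpr int num_loop  = (K + KPerBlock - 1) / KPerBlock;   // 2 K-tiles per block
static_assert(grid_size == 16 && num_loop == 2, "launch shape for the regular cases");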
template <typename DataType,
typename GemmTraits,
ck::index_t scalar_per_vector,
typename BlockShape,
typename ThreadLayoutShape>
void PerformGemm(const ck::index_t M,
const ck::index_t N,
const ck::index_t K,
const BlockShape& tile_shape,
const ThreadLayoutShape& thread_layout)
{
// Global memory buffers
DeviceMem a_mem(M * K * sizeof(DataType));
DeviceMem b_mem(K * N * sizeof(DataType));
DeviceMem c_mem(M * N * sizeof(DataType));
std::vector<DataType> a_data(M * K);
std::vector<DataType> b_data(K * N);
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(a_data);
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(b_data);
a_mem.ToDevice(a_data.data());
b_mem.ToDevice(b_data.data());
c_mem.SetZero();
const ck::index_t grid_size =
ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape)) *
ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
const auto kernel =
DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayoutShape>;
launch_and_time_kernel(StreamConfig{nullptr},
kernel,
dim3(grid_size),
dim3(ck::wrapper::size(thread_layout)),
0,
a_mem.GetDeviceBuffer(),
b_mem.GetDeviceBuffer(),
c_mem.GetDeviceBuffer(),
M,
N,
K,
tile_shape,
thread_layout);
std::vector<DataType> c_data(M * N);
c_mem.FromDevice(c_data.data());
CheckResult<DataType>(a_data, b_data, c_data, M, N, K);
}
TEST(TestGemm, Float)
{
using DataType = float;
const auto thread_layout = ck::make_tuple(ck::Number<16>{}, ck::Number<16>{});
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 4>(
512, 512, 128, tile_shape, thread_layout);
// Irregular case
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 1>(
129, 129, 67, tile_shape, thread_layout);
}
TEST(TestGemm, Int8)
{
using DataType = int8_t;
const auto thread_layout = ck::make_tuple(ck::Number<64>{}, ck::Number<4>{});
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1, 16>(
512, 512, 128, tile_shape, thread_layout);
// Irregular case
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1, 1>(
129, 129, 67, tile_shape, thread_layout);
}
TEST(TestGemm, Half)
{
using DataType = ck::half_t;
const auto thread_layout = ck::make_tuple(ck::Number<32>{}, ck::Number<8>{});
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 8>(
512, 512, 128, tile_shape, thread_layout);
// Irregular case
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 1>(
129, 129, 67, tile_shape, thread_layout);
}
TEST(TestGemm, Float_2x4_4x2_XdlPerWave)
{
using DataType = float;
const auto thread_layout_4x2_xdl_per_wave = ck::make_tuple(ck::Number<16>{}, ck::Number<8>{});
const auto thread_layout_2x4_xdl_per_wave = ck::make_tuple(ck::Number<8>{}, ck::Number<16>{});
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_4K1, 4>(
512, 512, 128, tile_shape, thread_layout_4x2_xdl_per_wave);
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x4XdlPerWave_4K1, 4>(
512, 512, 128, tile_shape, thread_layout_2x4_xdl_per_wave);
}
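A note on the scalar_per_vector arguments used above: for the evenly divisible 512 x 512 x 128 problems, each data type ends up with a 16-byte vectorized access on the K-contiguous dimension, while the irregular 129 x 129 x 67 problems fall back to scalar access. The arithmetic, assuming the usual sizes (4-byte float, 2-byte ck::half_t, 1-byte int8_t):

#include <cstdint>
// 4 floats, 8 halves, or 16 int8 scalars each amount to one 16-byte access.
static_assert(4 * sizeof(float) == 16, "float path: 4 scalars per vector");
static_assert(16 * sizeof(std::int8_t) == 16, "int8 path: 16 scalars per vector");
// ck::half_t is 2 bytes wide, so 8 * 2 == 16 bytes as well (assumption stated above).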
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
class TestWrapperLayout : public ::testing::Test
{
protected:
static constexpr auto I0 = ck::Number<0>{};
static constexpr auto I1 = ck::Number<1>{};
template <typename Desc,
typename Desc1d,
typename LayoutRuntime,
typename LayoutCompiletime,
typename Idxs>
void Run(Desc& desc,
Desc1d& desc_1d,
LayoutRuntime& layout_runtime,
LayoutCompiletime& layout_compiletime,
const std::vector<Idxs>& idxs)
{
// 1d check
EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
// Check layout compiletime and runtime result consistency
EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
{
const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i));
const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
}
// size(layout)-d check, don't check if access is hierarchical
if constexpr(!IsNestedTuple(Idxs{}))
{
ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
ck::wrapper::size<d>(layout_compiletime));
});
}
for(const auto idx : idxs)
{
const ck::index_t layout_runtime_offset = layout_runtime(idx);
const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
const ck::index_t desc_offset =
desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
EXPECT_EQ(layout_runtime_offset, desc_offset);
EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
}
}
};
TEST_F(TestWrapperLayout, 2d)
{
// dims:(4, 3) strides:(1, 4)
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s1 = 1;
constexpr ck::index_t s0 = 4;
const auto desc =
ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
const auto layout_compiletime =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs.emplace_back(h, w);
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
}
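The 2d case above pins down the column-major convention: with dims (4, 3) and packed strides (1, 4), the offset of index (h, w) is h * 1 + w * 4, which is what both the descriptor and the layouts are checked against. A worked check for the last element:

constexpr int offset_3_2 = 3 * 1 + 2 * 4; // index (3, 2) -> offset 11
static_assert(offset_3_2 == 11, "last element of a packed 4 x 3 column-major layout");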
TEST_F(TestWrapperLayout, 3d_nested)
{
// dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s3 = 2;
constexpr ck::index_t s2 = 4;
constexpr ck::index_t s1 = 12;
constexpr ck::index_t s0 = 48;
const auto desc = ck::make_naive_tensor_descriptor(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_3d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
ck::make_pass_through_transform(d1),
ck::make_pass_through_transform(d0)),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
const auto layout_runtime =
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
ck::Number<s1>{},
ck::Number<s0>{}));
std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;
for(ck::index_t d = 0; d < d2 * d3; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_3d.emplace_back(d, h, w);
}
}
}
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
// Check also 4d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
TEST_F(TestWrapperLayout, 2d_nested)
{
// dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s3 = 2;
constexpr ck::index_t s2 = 4;
constexpr ck::index_t s1 = 48;
constexpr ck::index_t s0 = 12;
const auto desc = ck::make_naive_tensor_descriptor(
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_2d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
const auto layout_runtime =
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
for(ck::index_t h = 0; h < d2 * d3; h++)
{
for(ck::index_t w = 0; w < d0 * d1; w++)
{
idxs_2d.emplace_back(h, w);
}
}
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
// Check also 4d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
idxs_4d;
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w));
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
TEST_F(TestWrapperLayout, 3d_double_nested)
{
// dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s4 = 2;
constexpr ck::index_t s3 = 4;
constexpr ck::index_t s2 = 8;
constexpr ck::index_t s1 = 96;
constexpr ck::index_t s0 = 24;
const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
ck::Number<d3>{},
ck::Number<d2>{},
ck::Number<d1>{},
ck::Number<d0>{}),
ck::make_tuple(ck::Number<s4>{},
ck::Number<s3>{},
ck::Number<s2>{},
ck::Number<s1>{},
ck::Number<s0>{}));
// Reverse due to column major
const auto desc_1d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
ck::make_tuple(ck::Sequence<0>{}));
const auto desc_3d = transform_tensor_descriptor(
desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
ck::make_pass_through_transform(d2),
ck::make_merge_transform(ck::make_tuple(d0, d1))),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
const auto desc_2d = transform_tensor_descriptor(
desc_3d,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
ck::make_pass_through_transform(d1 * d0)),
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0)));
const auto layout_compiletime = ck::wrapper::make_layout(
ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
{
for(ck::index_t w = 0; w < d0 * d1; w++)
{
idxs_2d.emplace_back(h, w);
}
}
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
// Check also 3d iteration
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;
for(ck::index_t d = 0; d < d3 * d4; d++)
{
for(ck::index_t h = 0; h < d2; h++)
{
for(ck::index_t w = 0; w < d1 * d0; w++)
{
idxs_3d.emplace_back(ck::make_tuple(d, h), w);
}
}
}
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
// Check also 5d iteration
std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
ck::Tuple<ck::index_t, ck::index_t>>>
idxs_5d;
for(ck::index_t f = 0; f < d4; f++)
{
for(ck::index_t e = 0; e < d3; e++)
{
for(ck::index_t d = 0; d < d2; d++)
{
for(ck::index_t h = 0; h < d1; h++)
{
for(ck::index_t w = 0; w < d0; w++)
{
idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
ck::make_tuple(h, w));
}
}
}
}
}
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
}
TEST(TestLayoutHelpers, SizeAndGet)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
// Size of layout
EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);
// Size of dims
EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);
// Access through new layout (using get with layout object)
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
d4);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
d3);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
}
TEST(TestLayoutHelpers, DepthAndRank)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto layout_runtime = ck::wrapper::make_layout(
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
// Check for integer
EXPECT_EQ(ck::wrapper::depth(d0), 0);
EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);
EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
// Check for integer
EXPECT_EQ(ck::wrapper::rank(d0), 1);
}
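For readers new to these helpers: rank counts top-level tuple entries and depth counts nesting levels, with a plain integer having depth 0 and rank 1, exactly as the expectations above encode. A standalone std::tuple analogue of the depth computation (illustrative only, not the CK implementation):

#include <algorithm>
#include <tuple>
template <typename T>
struct depth_of { static constexpr int value = 0; }; // scalar: depth 0
template <typename... Ts>
struct depth_of<std::tuple<Ts...>>
{
    static constexpr int value = 1 + std::max({0, depth_of<Ts>::value...});
};
// The shape (((2, 2), 3), (4, 3)) modeled with std::tuple:
using Shape = std::tuple<std::tuple<std::tuple<int, int>, int>, std::tuple<int, int>>;
static_assert(depth_of<Shape>::value == 3);   // matches EXPECT_EQ(depth(...), 3)
static_assert(std::tuple_size_v<Shape> == 2); // rank: two top-level entries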
TEST(TestLayoutHelpers, ShapeAndStrides)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
constexpr ck::index_t s4 = 2;
constexpr ck::index_t s3 = 4;
constexpr ck::index_t s2 = 8;
constexpr ck::index_t s1 = 96;
constexpr ck::index_t s0 = 24;
const auto shape_compiletime = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
const auto strides_compiletime = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
const auto shape_runtime =
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
const auto strides_runtime =
ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);
const auto layout_compiletime =
ck::wrapper::make_layout(shape_compiletime, strides_compiletime);
constexpr bool check_compiletime_shape =
std::is_same_v<decltype(shape_compiletime),
std::remove_reference_t<decltype(shape(layout_compiletime))>>;
constexpr bool check_runtime_shape =
std::is_same_v<decltype(shape_runtime),
std::remove_reference_t<decltype(shape(layout_runtime))>>;
EXPECT_TRUE(check_compiletime_shape);
EXPECT_TRUE(check_runtime_shape);
}
TEST(TestLayoutHelpers, Hierarchical)
{
// dims:(((2, 2), 3), (4, 3))
constexpr ck::index_t d4 = 2;
constexpr ck::index_t d3 = 2;
constexpr ck::index_t d2 = 3;
constexpr ck::index_t d1 = 4;
constexpr ck::index_t d0 = 3;
const auto runtime_shape =
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
const auto layout_runtime = ck::wrapper::make_layout(runtime_shape);
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);
EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);
EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);
EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
TEST(TestPartition, LocalPartition)
{
const auto shape =
ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
const auto strides =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
const auto layout = ck::wrapper::make_layout(shape, strides);
std::vector<ck::index_t> data(ck::wrapper::size(layout));
std::iota(data.begin(), data.end(), 0);
const auto tensor =
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
const auto thread_steps = ck::make_tuple(ck::Number<1>{}, ck::Number<8>{}, ck::Number<1>{});
const auto thread_layout = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}, ck::Number<1>{});
// 3d partition on 2d shape (calculate partition on 3d thread layout, and then skip first dim)
const auto thread_projection =
ck::make_tuple(ck::wrapper::slice(4), ck::Number<1>{}, ck::Number<1>{});
constexpr ck::index_t projection_thread_length = ck::Number<4>{};
for(ck::index_t thread_id = 0;
thread_id < ck::wrapper::size(thread_layout) / projection_thread_length;
thread_id++)
{
const auto packed_partition =
ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_projection);
const auto expected_partition_size =
ck::wrapper::size(tensor) /
(ck::wrapper::size(thread_layout) / projection_thread_length);
const auto expected_partition_first_val = thread_id * ck::wrapper::size<1>(thread_steps);
const auto expected_partition_second_val = expected_partition_first_val + 1;
EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
EXPECT_EQ(packed_partition(0), expected_partition_first_val);
EXPECT_EQ(packed_partition(1), expected_partition_second_val);
}
}
TEST(TestPartition, LocalTile)
{
const auto shape = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
const auto layout = ck::wrapper::make_layout(shape, strides);
std::vector<ck::index_t> data(ck::wrapper::size(layout));
std::iota(data.begin(), data.end(), 0);
const auto tensor =
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
// 4d tile partitioning on 3d shape (calculate tile on 4d tile layout, and then skip last dim)
const auto block_shape =
ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{}, ck::Number<2>{});
const auto block_projection =
ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(2));
constexpr ck::index_t projection_block_dim = ck::Number<2>{};
const auto num_blocks =
ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));
std::vector<ck::index_t> block_idxs(ck::wrapper::size(num_blocks));
std::iota(block_idxs.begin(), block_idxs.end(), 0);
for(auto block_idx : block_idxs)
{
const auto packed_tile =
ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_projection);
const auto expected_tile_size = ck::wrapper::size(block_shape) / projection_block_dim;
auto expected_tile_first_val = (block_idx % ck::wrapper::size<2>(num_blocks)) *
ck::wrapper::size<2>(block_shape) *
ck::wrapper::size<2>(strides);
block_idx /= ck::wrapper::size<2>(num_blocks);
expected_tile_first_val += (block_idx % ck::wrapper::size<1>(num_blocks)) *
ck::wrapper::size<1>(block_shape) *
ck::wrapper::size<1>(strides);
block_idx /= ck::wrapper::size<1>(num_blocks);
expected_tile_first_val += (block_idx % ck::wrapper::size<0>(num_blocks)) *
ck::wrapper::size<0>(block_shape) *
ck::wrapper::size<0>(strides);
const auto expected_tile_second_val = expected_tile_first_val + 1;
EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
EXPECT_EQ(packed_tile(0), expected_tile_first_val);
EXPECT_EQ(packed_tile(1), expected_tile_second_val);
}
}
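The expected-value arithmetic in the loop above decodes block_idx with the last dimension varying fastest. A restatement for a single block index, using this test's values (num_blocks = (8, 1, 2), tile dims (2, 4, 2), strides (1, 16, 64)):

// Decoding block_idx = 3 exactly as the test's loop does.
int idx = 3;
int first_val = (idx % 2) * 2 * 64; // dim 2: 1 * 2 * 64 = 128
idx /= 2;
first_val += (idx % 1) * 4 * 16;    // dim 1: 0
idx /= 1;
first_val += (idx % 8) * 2 * 1;     // dim 0: 1 * 2 * 1 = 2
// first_val == 130: offset of the first element of tile 3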
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/library/utility/device_memory.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
// Compare data in tensor with offset from layout.
// Data and offset should match if physical memory has been initialized with
// sequentially increasing values from 0.
template <typename TensorType>
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
{
const auto& layout = ck::wrapper::layout(tensor);
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
{
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
{
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
{
const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
if(tensor(idx) != layout(idx))
{
return false;
}
}
}
}
return true;
}
template <typename TensorType>
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
{
const auto& layout = ck::wrapper::layout(tensor);
for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
{
if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
{
return false;
}
}
return true;
}
template <ck::index_t nelems, typename TensorType>
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
{
const auto& layout = ck::wrapper::layout(tensor);
bool success = true;
ck::static_for<0, nelems, 1>{}([&](auto w) {
if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
{
success = false;
}
});
return success;
}
template <typename TensorType>
__host__ __device__ void InitTensor(TensorType& tensor)
{
for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
{
tensor(i) = i;
}
}
template <ck::index_t nelems, typename TensorType>
__host__ __device__ void StaticInitTensor(TensorType& tensor)
{
ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
}
// Tests
TEST(TestTensor, ReadWriteHostMemory)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> data;
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
InitTensor(tensor);
EXPECT_TRUE(TestTensorCheck1d(tensor));
EXPECT_TRUE(TestTensorCheck3d(tensor));
}
__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
constexpr ck::index_t nelems = 8;
__shared__ ck::index_t p_shared[nelems];
ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
bool* casted_success_ptr = static_cast<bool*>(success);
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
constexpr auto vgpr_layout =
ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));
auto tensor_global =
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
auto tensor_vgpr =
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
vgpr_layout);
InitTensor(tensor_global);
InitTensor(tensor_lds);
StaticInitTensor<nelems>(tensor_vgpr);
*casted_success_ptr = TestTensorCheck1d(tensor_global);
*casted_success_ptr &= TestTensorCheck3d(tensor_global);
*casted_success_ptr &= TestTensorCheck1d(tensor_lds);
*casted_success_ptr &= TestTensorCheck3d(tensor_lds);
*casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
}
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> host_data;
DeviceMem data_buf(nelems * sizeof(ck::index_t));
data_buf.ToDevice(&host_data[0]);
DeviceMem success_buf(sizeof(bool));
launch_and_time_kernel(StreamConfig{},
TestTensorReadWriteDevice,
dim3(1),
dim3(1),
0,
data_buf.GetDeviceBuffer(),
success_buf.GetDeviceBuffer());
bool success;
success_buf.FromDevice(&success);
EXPECT_TRUE(success);
}
TEST(TestTensor, Slicing)
{
constexpr ck::index_t nelems = 8;
std::array<ck::index_t, nelems> data;
const auto shape = ck::make_tuple(ck::make_tuple(2, 2), 2);
const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4);
const auto layout = ck::wrapper::make_layout(shape, strides);
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
InitTensor(tensor);
auto tensor2x2x2 =
tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0)));
EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));
auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
EXPECT_TRUE(TestTensorCheck1d(tensor2x2));
auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1)));
EXPECT_EQ(rank(tensor1x1), 2);
EXPECT_EQ(depth(tensor1x1), 2);
EXPECT_EQ(size(tensor1x1), 1);
EXPECT_TRUE(TestTensorCheck1d(tensor1x1));
auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0)));
EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
EXPECT_EQ(ck::wrapper::size(tensor2), 2);
EXPECT_TRUE(TestTensorCheck1d(tensor2));
auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2));
EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0)));
EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1);
EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1);
EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2);
EXPECT_TRUE(TestTensorCheck1d(tensor2_v2));
// negative indexing
auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
EXPECT_EQ(rank(tensor1x2), 2);
EXPECT_EQ(depth(tensor1x2), 2);
EXPECT_EQ(size(tensor1x2), 2);
EXPECT_TRUE(TestTensorCheck1d(tensor1x2));
}