Commit 874a78f9 authored by Jun Liu

Merge branch 'amd-develop' into amd-master

parents 6368be50 2fd6c6d4
@@ -135,6 +135,8 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
             return col2img.IsSupportedArgument(argument);
         }
+        throw std::runtime_error("Conv_tensor_rearrange: problem with tensor rearrange operator. ");
+        return 1;
     }
 };
...
@@ -60,7 +60,9 @@ class TestGemmSplitK : public testing::Test
              const int StrideA,
              const int StrideB,
              const int StrideC,
-             int kbatch = 1)
+             int kbatch   = 1,
+             int n_warmup = 1,
+             int n_iter   = 10)
     {
         bool pass = ck::profiler::profile_gemm_splitk_impl<ADataType,
                                                            BDataType,
@@ -68,8 +70,19 @@ class TestGemmSplitK : public testing::Test
                                                            CDataType,
                                                            ALayout,
                                                            BLayout,
-                                                           CLayout>(
-            verify_, init_method_, log_, bench_, M, N, K, StrideA, StrideB, StrideC, kbatch);
+                                                           CLayout>(verify_,
+                                                                    init_method_,
+                                                                    log_,
+                                                                    bench_,
+                                                                    M,
+                                                                    N,
+                                                                    K,
+                                                                    StrideA,
+                                                                    StrideB,
+                                                                    StrideC,
+                                                                    kbatch,
+                                                                    n_warmup,
+                                                                    n_iter);
         EXPECT_TRUE(pass);
     }
 };
...
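Both GEMM test fixtures in this commit (TestGemmSplitK above and TestGroupedGemm below) gain n_warmup and n_iter parameters that are forwarded to the profiler. A minimal sketch of overriding the new defaults from a test body; the fixture is assumed to be instantiated as a typed test, the leading M/N/K parameters are inferred from the profiler call, and the problem sizes are illustrative only:

// Hypothetical usage sketch: call TestGemmSplitK::Run with explicit warm-up
// and measurement iteration counts (the new defaults are 1 and 10).
TYPED_TEST(TestGemmSplitK, SplitK4_CustomTiming)
{
    constexpr int M = 256, N = 512, K = 4096;             // made-up problem size
    constexpr int StrideA = K, StrideB = N, StrideC = N;  // row-major strides
    constexpr int kbatch   = 4;                           // split K into 4 batches
    constexpr int n_warmup = 5;                           // extra un-timed launches
    constexpr int n_iter   = 50;                          // timed launches
    this->Run(M, N, K, StrideA, StrideB, StrideC, kbatch, n_warmup, n_iter);
}

The grouped variant takes the same trailing arguments, with per-group std::vector problem sizes in place of the scalars.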
@@ -63,7 +63,9 @@ class TestGroupedGemm : public testing::TestWithParam<int>
              const std::vector<int>& StrideAs,
              const std::vector<int>& StrideBs,
              const std::vector<int>& StrideCs,
-             int kbatch = 1)
+             int kbatch   = 1,
+             int n_warmup = 1,
+             int n_iter   = 10)
     {
         bool pass = ck::profiler::profile_grouped_gemm_impl<ADataType,
                                                             BDataType,
@@ -71,8 +73,19 @@ class TestGroupedGemm : public testing::TestWithParam<int>
                                                             float,
                                                             ALayout,
                                                             BLayout,
-                                                            ELayout>(
-            verify_, init_method_, log_, bench_, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch);
+                                                            ELayout>(verify_,
+                                                                     init_method_,
+                                                                     log_,
+                                                                     bench_,
+                                                                     Ms,
+                                                                     Ns,
+                                                                     Ks,
+                                                                     StrideAs,
+                                                                     StrideBs,
+                                                                     StrideCs,
+                                                                     kbatch,
+                                                                     n_warmup,
+                                                                     n_iter);
         EXPECT_TRUE(pass);
     }
 };
...
add_custom_target(test_normalization_bwd_gamma_beta)

add_gtest_executable(test_layernorm2d_bwd_gamma_beta_fp32 test_layernorm2d_bwd_gamma_beta_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance)
    add_dependencies(test_normalization_bwd_gamma_beta test_layernorm2d_bwd_gamma_beta_fp32)
endif()

add_gtest_executable(test_groupnorm_bwd_gamma_beta_fp32 test_groupnorm_bwd_gamma_beta_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance)
    add_dependencies(test_normalization_bwd_gamma_beta test_groupnorm_bwd_gamma_beta_fp32)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"

#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestgroupnormBwdGammaBeta : public ::testing::Test
{
    protected:
    using DYDataType         = std::tuple_element_t<0, Tuple>;
    using XDataType          = std::tuple_element_t<1, Tuple>;
    using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
    using ComputeDataType    = std::tuple_element_t<3, Tuple>;
    using DGammaDataType     = std::tuple_element_t<4, Tuple>;
    using DBetaDataType      = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // Bwd data: [N, H, W, G, C], reduce H, W, C
        std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
                                                         {1, 2, 3, 4, 5},
                                                         {256, 9, 9, 9, 9},
                                                         {1, 64, 64, 32, 10},
                                                         {1, 32, 32, 32, 20},
                                                         {1, 16, 16, 32, 40}};

        for(auto length : lengths)
        {
            bool success = ck::profiler::profile_groupnorm_bwd_gamma_beta_impl<DYDataType,
                                                                               XDataType,
                                                                               MeanInvStdDataType,
                                                                               ComputeDataType,
                                                                               DGammaDataType,
                                                                               DBetaDataType>(
                true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
    std::tuple<F32, F32, F32, F32, F32, F32>>;

TYPED_TEST_SUITE(TestgroupnormBwdGammaBeta, KernelTypes);
TYPED_TEST(TestgroupnormBwdGammaBeta, Test_FP32) { this->Run(); }
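The suite is registered for a single all-FP32 tuple. Purely as an illustration of the typed-test mechanism, an extended type list could look like the sketch below; whether any extra combination actually runs depends on the device_normalization_bwd_gamma_beta instances built into the library, so treat the F16 tuple as an assumption rather than a supported configuration:

// Illustrative only: a second data-type combination registered with the same
// typed-test machinery (this list would replace KernelTypes above).
using KernelTypesExtended = ::testing::Types<
    // DYDataType, XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType
    std::tuple<F32, F32, F32, F32, F32, F32>,
    std::tuple<F16, F16, F16, F32, F32, F32>>; // hypothetical mixed-precision case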
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"

#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestLayernorm2dBwdGammaBeta : public ::testing::Test
{
    protected:
    using DYDataType         = std::tuple_element_t<0, Tuple>;
    using XDataType          = std::tuple_element_t<1, Tuple>;
    using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
    using ComputeDataType    = std::tuple_element_t<3, Tuple>;
    using DGammaDataType     = std::tuple_element_t<4, Tuple>;
    using DBetaDataType      = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // Bwd data: [N, D], reduce D
        std::vector<std::vector<ck::index_t>> lengths = {
            {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};

        for(auto length : lengths)
        {
            bool success = ck::profiler::profile_layernorm_bwd_gamma_beta_impl<DYDataType,
                                                                               XDataType,
                                                                               MeanInvStdDataType,
                                                                               ComputeDataType,
                                                                               DGammaDataType,
                                                                               DBetaDataType,
                                                                               2>(
                true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
    std::tuple<F32, F32, F32, F32, F32, F32>>;

TYPED_TEST_SUITE(TestLayernorm2dBwdGammaBeta, KernelTypes);
TYPED_TEST(TestLayernorm2dBwdGammaBeta, Test_FP32) { this->Run(); }
@@ -21,49 +21,59 @@ template <typename InputTensor,
           typename OutputTensor,
           typename BlockShape,
           typename ThreadLayoutShape,
-          typename LocalTileSteps,
-          typename LocalPartitionSteps>
+          bool UseOptimizedCopy>
 __global__ void TestCopyDevice(const InputTensor input_tensor,
                                OutputTensor output_tensor,
                                const BlockShape tile_shape,
-                               const ThreadLayoutShape thread_layout,
-                               const LocalTileSteps block_steps,
-                               const LocalPartitionSteps thread_steps)
+                               const ThreadLayoutShape thread_layout)
 {
     __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
-    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
+    const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
         p_shared, ck::wrapper::make_layout(tile_shape));
-    const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x);
+    const auto block_idx = static_cast<ck::index_t>(blockIdx.x);
     // Get local tiles for global memory
-    const auto input_local_tile =
-        ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps);
+    const auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
     const auto output_local_tile =
-        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps);
+        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);
     // Get partition per thread
-    const auto input_local_partition = ck::wrapper::make_local_partition(
-        input_local_tile, thread_layout, threadIdx.x, thread_steps);
+    const auto input_local_partition =
+        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
     auto lds_local_partition =
-        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps);
+        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
-    auto output_local_partition = ck::wrapper::make_local_partition(
-        output_local_tile, thread_layout, threadIdx.x, thread_steps);
+    auto output_local_partition =
+        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
     // Allocate VGPR
-    constexpr ck::index_t scalar_per_vector = 1;
-    constexpr ck::index_t vgpr_size = ck::wrapper::size(lds_local_partition);
-    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
-                                                         vgpr_size,
-                                                         scalar_per_vector,
-                                                         ck::index_t>();
+    auto tensor_vgpr =
+        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
+            layout(lds_local_partition));
     // Perform copy
-    ck::wrapper::copy(input_local_partition, lds_local_partition);
-    ck::wrapper::copy(lds_local_partition, tensor_vgpr);
-    ck::wrapper::copy(tensor_vgpr, output_local_partition);
+    if constexpr(UseOptimizedCopy)
+    {
+        using DimAccessOrder                    = ck::Tuple<ck::Number<1>, ck::Number<0>>;
+        constexpr ck::index_t vector_dim        = 0;
+        constexpr ck::index_t scalar_per_vector = 2;
+        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
+                                                                         lds_local_partition);
+        // TODO: Enable optimized copy for static buffers
+        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
+                                                                         tensor_vgpr);
+        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
+                                                                         output_local_partition);
+    }
+    else
+    {
+        ck::wrapper::copy(input_local_partition, lds_local_partition);
+        ck::wrapper::copy(lds_local_partition, tensor_vgpr);
+        ck::wrapper::copy(tensor_vgpr, output_local_partition);
+    }
 }
+template <bool UseOptimizedCopy>
 void PerformCopyGlobalToGlobalViaLDS()
 {
     const auto shape =
@@ -89,15 +99,8 @@ void PerformCopyGlobalToGlobalViaLDS()
     auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
         static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);
-    const auto thread_layout =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{});
-    const auto tile_shape =
-        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{});
-    const auto thread_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{});
-    const auto block_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{});
+    const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{});
+    const auto tile_shape    = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});
     const ck::index_t grid_size = ck::math::integer_divide_ceil(
         ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
@@ -106,8 +109,7 @@ void PerformCopyGlobalToGlobalViaLDS()
                                     decltype(output_tensor_global),
                                     decltype(tile_shape),
                                     decltype(thread_layout),
-                                    decltype(block_steps),
-                                    decltype(thread_steps)>;
+                                    UseOptimizedCopy>;
     launch_and_time_kernel(StreamConfig{},
                            kernel,
                            dim3(grid_size),
@@ -116,9 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS()
                            input_tensor_global,
                            output_tensor_global,
                            tile_shape,
-                           thread_layout,
-                           block_steps,
-                           thread_steps);
+                           thread_layout);
     // Verify results
     std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
@@ -126,4 +126,5 @@ void PerformCopyGlobalToGlobalViaLDS()
     EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
 }
-TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); }
+TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS<false>(); }
+TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS<true>(); }
@@ -29,42 +29,29 @@ TEST(TestPartition, LocalPartition)
     const auto tensor =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
-    const auto thread_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{});
-    const auto thread_layout =
-        ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{});
-    for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
-    {
-        const auto raked_partition =
-            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
-        const auto expected_partition_size =
-            ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
-        EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size);
-        EXPECT_EQ(raked_partition(0), thread_id);
-    }
+    const auto thread_steps  = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});
+    const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});
     for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
     {
         const auto packed_partition =
-            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps);
+            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
         const auto expected_partition_size =
             ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
-        const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps);
-        const auto expected_partition_second_val = expected_partition_first_val + 1;
+        const auto expected_partition_first_val = thread_id * ck::wrapper::size<0>(thread_steps);
         EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
         EXPECT_EQ(packed_partition(0), expected_partition_first_val);
-        EXPECT_EQ(packed_partition(1), expected_partition_second_val);
     }
 }
 TEST(TestPartition, LocalTile)
 {
-    const auto shape =
-        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
-    const auto strides =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
-    const auto layout = ck::wrapper::make_layout(shape, strides);
+    const auto shape   = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
+    const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
+    const auto layout  = ck::wrapper::make_layout(shape, strides);
     std::vector<ck::index_t> data(ck::wrapper::size(layout));
     std::iota(data.begin(), data.end(), 0);
@@ -72,48 +59,34 @@ TEST(TestPartition, LocalTile)
     const auto tensor =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
-    const auto block_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
-    const auto block_shape =
-        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
-    const auto block_layout =
-        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
-    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> block_idxs;
-    for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++)
-    {
-        for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++)
-        {
-            for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++)
-            {
-                block_idxs.emplace_back(ck::make_tuple(x, y), z);
-            }
-        }
-    }
-    for(const auto& block_idx : block_idxs)
-    {
-        const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
-        const auto expected_tile_size = ck::wrapper::size(block_shape);
-        EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size);
-        EXPECT_EQ(raked_tile(0), layout(block_idx));
-    }
-    for(const auto& block_idx : block_idxs)
+    const auto block_shape = ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{});
+    const auto num_blocks =
+        ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
+                       ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
+                       ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));
+    std::vector<ck::index_t> block_idxs(ck::wrapper::size(num_blocks));
+    std::iota(block_idxs.begin(), block_idxs.end(), 0);
+    for(auto block_idx : block_idxs)
     {
-        const auto packed_tile =
-            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps);
+        const auto packed_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
         const auto expected_tile_size = ck::wrapper::size(block_shape);
-        const auto expected_tile_first_val =
-            ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) *
-                ck::wrapper::size<0, 0>(strides) +
-            ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) *
-                ck::wrapper::size<0, 1>(strides) +
-            ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) *
-                ck::wrapper::size<1>(strides);
+        auto expected_tile_first_val = (block_idx % ck::wrapper::size<2>(num_blocks)) *
+                                       ck::wrapper::size<2>(block_shape) *
+                                       ck::wrapper::size<2>(strides);
+        block_idx /= ck::wrapper::size<2>(num_blocks);
+        expected_tile_first_val += (block_idx % ck::wrapper::size<1>(num_blocks)) *
+                                   ck::wrapper::size<1>(block_shape) *
+                                   ck::wrapper::size<1>(strides);
+        block_idx /= ck::wrapper::size<1>(num_blocks);
+        expected_tile_first_val += (block_idx % ck::wrapper::size<0>(num_blocks)) *
+                                   ck::wrapper::size<0>(block_shape) *
+                                   ck::wrapper::size<0>(strides);
+        const auto expected_tile_second_val = expected_tile_first_val + 1;
         EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
         EXPECT_EQ(packed_tile(0), expected_tile_first_val);
+        EXPECT_EQ(packed_tile(1), expected_tile_second_val);
     }
 }
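The rewritten LocalTile expectation decodes a linear block index one dimension at a time, innermost (dimension 2) first. A worked instance of that arithmetic using the shapes from the test above; the particular block index is chosen arbitrarily for illustration:

// shape = (16, 4, 4), strides = (1, 16, 64), block_shape = (2, 4, 2)
// => num_blocks = (8, 1, 2), i.e. 16 tiles in total.
// For block_idx = 5:
//   dim 2: (5 % 2) * 2 * 64 = 128, then block_idx = 5 / 2 = 2
//   dim 1: (2 % 1) * 4 * 16 =   0, then block_idx = 2 / 1 = 2
//   dim 0: (2 % 8) * 2 * 1  =   4
// expected_tile_first_val = 128 + 0 + 4 = 132; the next element along the
// unit-stride dimension (packed_tile(1)) is 133.
static_assert((5 % 2) * 2 * 64 + ((5 / 2) % 1) * 4 * 16 + (((5 / 2) / 1) % 8) * 2 * 1 == 132,
              "worked example for block_idx = 5");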
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iostream>
@@ -100,31 +100,26 @@ TEST(TestTensor, ReadWriteHostMemory)
 __global__ void TestTensorReadWriteDevice(void* data, void* success)
 {
     constexpr ck::index_t nelems = 8;
-    constexpr ck::index_t scalar_per_vector = 1;
     __shared__ ck::index_t p_shared[nelems];
     ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
     bool* casted_success_ptr = static_cast<bool*>(success);
     const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
+    constexpr auto vgpr_layout =
+        ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));
     auto tensor_global =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
     auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
-    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
-                                                         nelems,
-                                                         scalar_per_vector,
-                                                         ck::index_t>();
-    auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
-                                                         nelems,
-                                                         scalar_per_vector,
-                                                         ck::index_t>();
+    auto tensor_vgpr =
+        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
+            vgpr_layout);
     InitTensor(tensor_global);
     InitTensor(tensor_lds);
     StaticInitTensor<nelems>(tensor_vgpr);
-    StaticInitTensor<nelems>(tensor_sgpr);
     *casted_success_ptr = TestTensorCheck1d(tensor_global);
     *casted_success_ptr &= TestTensorCheck3d(tensor_global);
@@ -133,8 +128,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
     *casted_success_ptr &= TestTensorCheck3d(tensor_lds);
     *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
-    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);
 }
 TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
...
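Both wrapper test files above also reflect a change to register-backed tensors: the element-count and scalar_per_vector template parameters are replaced by a layout argument. A sketch of the two call forms as they appear in the hunks (old on top, new below); the variable names are local to this example:

// Old form: size and vectorization encoded as template parameters.
auto tensor_vgpr_old = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
                                                         /*nelems=*/8,
                                                         /*scalar_per_vector=*/1,
                                                         ck::index_t>();

// New form: the register tensor is described by a layout, here 8 elements
// with unit stride (the same vgpr_layout the test constructs).
constexpr auto vgpr_layout =
    ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}), ck::make_tuple(ck::Number<1>{}));
auto tensor_vgpr_new =
    ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(vgpr_layout);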