Commit 6368be50 authored by Jun Liu

Merge branch 'amd-develop' into amd-master

parents 32806d5f 71d6ede7
@@ -152,7 +152,6 @@ ENDFOREACH()
 if(CK_DEVICE_OTHER_INSTANCES)
     add_library(device_other_operations STATIC ${CK_DEVICE_OTHER_INSTANCES})
     add_library(composablekernels::device_other_operations ALIAS device_other_operations)
-    target_compile_features(device_other_operations PUBLIC)
     set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_include_directories(device_other_operations PUBLIC
         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
......
@@ -9,43 +9,42 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// TODO: Workaround for https://ontrack-internal.amd.com/browse/SWDEV-435347
-// void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
-//     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-//                                                                 NDHWGC,
-//                                                                 GKZYXC,
-//                                                                 ck::Tuple<>,
-//                                                                 NDHWGK,
-//                                                                 ck::Tuple<BF16, BF16>,
-//                                                                 ck::Tuple<BF16, BF16>,
-//                                                                 ck::Tuple<>,
-//                                                                 BF16,
-//                                                                 ScaleAdd,
-//                                                                 ScaleAdd,
-//                                                                 PassThrough>>>& instances)
-// {
-//     add_device_operation_instances(
-//         instances,
-//         device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
-//                                                                NDHWGC,
-//                                                                GKZYXC,
-//                                                                NDHWGK,
-//                                                                ConvFwdDefault>{});
-//     add_device_operation_instances(
-//         instances,
-//         device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
-//                                                                NDHWGC,
-//                                                                GKZYXC,
-//                                                                NDHWGK,
-//                                                                ConvFwd1x1P0>{});
-//     add_device_operation_instances(
-//         instances,
-//         device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
-//                                                                NDHWGC,
-//                                                                GKZYXC,
-//                                                                NDHWGK,
-//                                                                ConvFwd1x1S1P0>{});
-// }
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_ab_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                ck::Tuple<>,
+                                                                NDHWGK,
+                                                                ck::Tuple<BF16, BF16>,
+                                                                ck::Tuple<BF16, BF16>,
+                                                                ck::Tuple<>,
+                                                                BF16,
+                                                                ScaleAdd,
+                                                                ScaleAdd,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_scaleadd_ab_bf16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               NDHWGK,
+                                                               ConvFwd1x1S1P0>{});
+}
 } // namespace instance
 } // namespace device
......
-set(DEVICE_SOFTMAX_INSTANCES)
-list(APPEND DEVICE_SOFTMAX_INSTANCES
+add_instance_library(device_softmax_instance
     device_softmax_f16_f16_instance_rank3_reduce1.cpp
     device_softmax_f16_f16_instance_rank3_reduce2.cpp
     device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -14,4 +13,3 @@ list(APPEND DEVICE_SOFTMAX_INSTANCES
     device_softmax_f32_f32_instance_rank4_reduce2.cpp
     device_softmax_f32_f32_instance_rank4_reduce3.cpp
     device_softmax_f32_f32_instance_rank4_reduce4.cpp)
-add_instance_library(device_softmax_instance ${DEVICE_SOFTMAX_INSTANCES})
-## utility
-set(UTILITY_SOURCE
+add_library(utility STATIC
     device_memory.cpp
     host_tensor.cpp
     convolution_parameter.cpp
 )
-add_library(utility STATIC ${UTILITY_SOURCE})
 add_library(composable_kernel::utility ALIAS utility)
+set_target_properties(utility PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_compile_options(utility PRIVATE ${CMAKE_COMPILER_WARNINGS})
 target_include_directories(utility PUBLIC
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/utility>"
 )
+if(WIN32)
+    target_compile_definitions(utility PUBLIC NOMINMAX)
+endif()
 rocm_install(
     TARGETS utility
......
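Reviewer note on the WIN32 branch added above: by default windows.h defines function-like min/max macros, which break ordinary std::min/std::max calls in any translation unit that includes it. A minimal repro of the failure mode (illustrative, not part of this commit):

    #include <windows.h>   // without NOMINMAX this defines min(a, b) / max(a, b) macros
    #include <algorithm>
    int smallest = std::min(1, 2); // "min(" matches the macro and the expansion fails to compile

Making NOMINMAX a PUBLIC compile definition propagates it to every target that links utility, so consumers do not each have to remember it.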
@@ -25,7 +25,7 @@ namespace ck {
 namespace profiler {

 template <typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
+void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
 {
     for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
         for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
@@ -34,7 +34,7 @@ void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functo
                 for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
                 {
                     auto a_val = A_ncdhw(n, c, d, h, w);
-                    functor(B_nchwd(n, c, h, w, d), a_val);
+                    functor(B_ndhwc(n, d, h, w, c), a_val);
                 }
 }
@@ -77,8 +77,6 @@ bool profile_transpose_impl(int do_verification,
     using ElementOp = ck::tensor_operation::element_wise::PassThrough;

-    // const auto element_op = ElementOp{};
-
     DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
     DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
@@ -118,6 +116,7 @@ bool profile_transpose_impl(int do_verification,
             // re-init C to zero before profiling next kernel
             b_device_buf.SetZero();

+            // run for verification
             invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});

             if(do_verification)
@@ -136,6 +135,7 @@ bool profile_transpose_impl(int do_verification,
             std::string op_name = op_ptr->GetTypeString();

+            // run for timing purposes
             float ave_time =
                 invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
@@ -153,10 +153,6 @@ bool profile_transpose_impl(int do_verification,
             std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                       << gb_per_sec << " GB/s, " << op_name << std::endl;

-            // pass = pass & ck::utils::check_err(b_device_result, b_host_result);
-            pass &= ck::utils::check_err(
-                b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
-
             if(tflops > best_tflops)
             {
                 best_op_name = op_name;
......
@@ -29,6 +29,7 @@ set(PROFILER_SOURCES
     profile_batchnorm_infer.cpp
     profile_grouped_conv_bwd_data.cpp
     profile_conv_tensor_rearrange.cpp
+    profile_transpose.cpp
 )

 if(DL_KERNELS)
@@ -58,7 +59,7 @@ set(PROFILER_EXECUTABLE ckProfiler)
 add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
 target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
@@ -91,6 +92,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
 if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
     target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include <unordered_map>
#include <vector>
#include "profiler/profile_transpose_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct DataType
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F16_F16, // 1
};
#define OP_NAME "transpose"
#define OP_DESC "Transpose"
struct TransposeArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"lengths", {}}};
bool parse_opt(const int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
const int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
static void print_helper_msg()
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: verification (0: no; 1: yes)\n");
printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg5: print tensor value (0: no; 1: yes)\n");
printf("arg6: time kernel (0=no, 1=yes)\n");
printf("arg7: --lengths: N, C, D, H, W\n");
}
int profile_transpose(int argc, char* argv[])
{
    // 7 positional tokens (argv[0]..argv[6]) plus "--lengths" and its 5 extents
    if(argc != 13)
    {
        print_helper_msg();
        exit(1);
    }
TransposeArgParser arg_parser;
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
arg_parser(argc, argv);
const std::vector<ck::index_t> lengths = arg_parser.long_opts["lengths"];
using F32 = float;
using F16 = ck::half_t;
auto profile = [&](auto a_type, auto b_type) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
constexpr ck::index_t NumDim = 5;
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, NumDim>(
do_verification, init_method, do_log, time_kernel, lengths);
return pass ? 0 : 1;
};
if(data_type == DataType::F32_F32_F32_F32_F32)
{
return profile(F32{}, F32{});
}
else if(data_type == DataType::F16_F16_F16_F16_F16)
{
return profile(F16{}, F16{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
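A usage sketch for the new entry point (hypothetical values; the flag order follows print_helper_msg above). TransposeArgParser greedily consumes the integers after --lengths until the next dash-prefixed token or the end of argv:

    // hypothetical invocation:
    //   ckProfiler transpose 1 1 2 0 1 --lengths 4 16 16 32 5
    // parses as: fp16, verify, decimal init, no tensor dump, time the kernel,
    // and long_opts["lengths"] == {4, 16, 16, 32, 5}, i.e. N, C, D, H, W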
@@ -3,7 +3,7 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/profiler/include
 )

-include(googletest)
+include(gtest)

 add_custom_target(tests)
@@ -50,6 +50,7 @@ function(add_test_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         add_executable(${TEST_NAME} ${ARGN})
+        target_link_libraries(${TEST_NAME} PRIVATE getopt::getopt)
         add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
         add_dependencies(tests ${TEST_NAME})
         add_dependencies(check ${TEST_NAME})
@@ -58,9 +59,7 @@ function(add_test_executable TEST_NAME)
     endif()
     #message("add_test returns ${result}")
     set(result ${result} PARENT_SCOPE)
-endfunction(add_test_executable TEST_NAME)
+endfunction()

-include(GoogleTest)
-
 function(add_gtest_executable TEST_NAME)
     message("adding gtest ${TEST_NAME}")
@@ -109,14 +108,14 @@ function(add_gtest_executable TEST_NAME)
         # suppress gtest warnings
         target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
-        target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
+        target_link_libraries(${TEST_NAME} PRIVATE gtest_main getopt::getopt)
         add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
         rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
         set(result 0)
     endif()
     #message("add_gtest returns ${result}")
     set(result ${result} PARENT_SCOPE)
-endfunction(add_gtest_executable TEST_NAME)
+endfunction()

 add_subdirectory(magic_number_division)
 add_subdirectory(space_filling_curve)
......
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include <tuple>
 #include "gtest/gtest.h"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "test_transpose_util.hpp"
+#include "profiler/profile_transpose_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
-using ck::index_t;

 template <typename Tuple>
 class TestTranspose : public ::testing::Test
 {
+    protected:
+    using ADataType = std::tuple_element_t<0, Tuple>;
+    using BDataType = std::tuple_element_t<1, Tuple>;
+
+    void Run()
+    {
+        std::vector<std::vector<ck::index_t>> lengths = {
+            {4, 16, 16, 32, 5}, {8, 16, 16, 32, 8} /**{32, 16, 16, 32, 8},**/};
+
+        for(auto length : lengths)
+        {
+            bool success = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
+                true, 2, false, false, length);
+            EXPECT_TRUE(success);
+        }
+    }
 };

-// clang-format off
-using KernelTypes = ::testing::Types<
-    std::tuple< F16, F16>,
-    std::tuple< F32, F32>
-    >;
-// clang-format on
+using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;

 TYPED_TEST_SUITE(TestTranspose, KernelTypes);

-//#include "test_transpose_ut_cases.inc"
+TYPED_TEST(TestTranspose, Test_FP16) { this->Run(); }
+TYPED_TEST(TestTranspose, Test_FP32) { this->Run(); }
-#pragma once
-
-TYPED_TEST(TestTranspose, Test1)
-{
-    // for 16, 8, 16, 32, 8
-    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
-    std::vector<index_t> lengths{16, 8, 16, 32, 8};
-    /**constexpr int N = 16;
-    constexpr int C = 8;
-    constexpr int D = 16;
-    constexpr int H = 32;
-    constexpr int W = 8;**/
-    this->Run();
-}
-
-TYPED_TEST(TestTranpose, Test2)
-{
-    std::vector<int> Ms{127, 255, 312, 799, 1573};
-    std::vector<index_t> lengths{16, 8, 16, 32, 16};
-    /**constexpr int N = 16;
-    constexpr int C = 8;
-    constexpr int D = 16;
-    constexpr int H = 32;
-    constexpr int W = 8;**/
-    this->Run();
-}
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <string>
-#include <sstream>
-#include <tuple>
-#include <vector>
-
-#include <gtest/gtest.h>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "include/ck/utility/data_type.hpp"
-
-#include "profiler/profile_transpose_impl.hpp"
-
-namespace ck {
-namespace test {
-
-template <typename Tuple>
-class TestTranspose : public testing::Test
-{
-    using F32 = float;
-
-    protected:
-    using ADataType = std::tuple_element_t<0, Tuple>;
-    using BDataType = std::tuple_element_t<1, Tuple>;
-
-    public:
-    static constexpr bool verify_ = true;
-    static constexpr int init_method_ = 1; // decimal value initialization
-    static constexpr bool log_ = false;
-    static constexpr bool bench_ = false; // measure kernel performance
-
-    std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};
-
-    void Run()
-    {
-        for(auto length : this->lengths_)
-        {
-            this->RunSingle(length);
-        }
-    }
-
-    void RunSingle()
-    {
-        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
-            verify_, init_method_, log_, bench_, lengths_);
-        EXPECT_TRUE(pass);
-    }
-};
-
-} // namespace test
-} // namespace ck
@@ -2,3 +2,7 @@ add_gtest_executable(test_layout test_layout.cpp)
 target_link_libraries(test_layout PRIVATE utility)
 add_gtest_executable(test_tensor test_tensor.cpp)
 target_link_libraries(test_tensor PRIVATE utility)
+add_gtest_executable(test_copy test_copy.cpp)
+target_link_libraries(test_copy PRIVATE utility)
+add_gtest_executable(test_partition test_partition.cpp)
+target_link_libraries(test_partition PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
// Test copy from Global to Global through LDS and VGPR
template <typename InputTensor,
typename OutputTensor,
typename BlockShape,
typename ThreadLayoutShape,
typename LocalTileSteps,
typename LocalPartitionSteps>
__global__ void TestCopyDevice(const InputTensor input_tensor,
OutputTensor output_tensor,
const BlockShape tile_shape,
const ThreadLayoutShape thread_layout,
const LocalTileSteps block_steps,
const LocalPartitionSteps thread_steps)
{
__shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
p_shared, ck::wrapper::make_layout(tile_shape));
const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x);
// Get local tiles for global memory
const auto input_local_tile =
ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps);
const auto output_local_tile =
ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps);
// Get partition per thread
const auto input_local_partition = ck::wrapper::make_local_partition(
input_local_tile, thread_layout, threadIdx.x, thread_steps);
auto lds_local_partition =
ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps);
auto output_local_partition = ck::wrapper::make_local_partition(
output_local_tile, thread_layout, threadIdx.x, thread_steps);
// Allocate VGPR
constexpr ck::index_t scalar_per_vector = 1;
constexpr ck::index_t vgpr_size = ck::wrapper::size(lds_local_partition);
auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
vgpr_size,
scalar_per_vector,
ck::index_t>();
// Perform copy
ck::wrapper::copy(input_local_partition, lds_local_partition);
ck::wrapper::copy(lds_local_partition, tensor_vgpr);
ck::wrapper::copy(tensor_vgpr, output_local_partition);
}
void PerformCopyGlobalToGlobalViaLDS()
{
const auto shape =
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{});
const auto strides =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
const auto layout = ck::wrapper::make_layout(shape, strides);
// 0, 1, 2, ..., size(shape) - 1
std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
std::iota(input_data.begin(), input_data.end(), 0);
// Global memory buffers
DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
in_buf.ToDevice(input_data.data());
out_buf.SetZero();
// Create tensors for global memory
const auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const ck::index_t*>(in_buf.GetDeviceBuffer()), layout);
auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);
const auto thread_layout =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{});
const auto tile_shape =
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{});
const auto thread_steps =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{});
const auto block_steps =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{});
const ck::index_t grid_size = ck::math::integer_divide_ceil(
ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
const auto kernel = TestCopyDevice<decltype(input_tensor_global),
decltype(output_tensor_global),
decltype(tile_shape),
decltype(thread_layout),
decltype(block_steps),
decltype(thread_steps)>;
launch_and_time_kernel(StreamConfig{},
kernel,
dim3(grid_size),
dim3(ck::wrapper::size(thread_layout)),
0,
input_tensor_global,
output_tensor_global,
tile_shape,
thread_layout,
block_steps,
thread_steps);
// Verify results
std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
out_buf.FromDevice(output_data.data());
EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
}
TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); }
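A sizing note for the copy test above, worked from the shapes it declares: the global layout holds (2 * 2) * 256 = 1024 elements and one tile covers (2 * 2) * 64 = 256 of them, so the kernel launches 4 blocks of size(thread_layout) = 32 threads, and each thread shuttles its share of a tile through LDS and VGPR:

    // grid_size           = integer_divide_ceil(1024, 256)               = 4
    // threads per block   = 1 * 1 * 32                                   = 32
    // elements per thread = size(tile_shape) / size(thread_layout)       = 256 / 32 = 8
    //                       (this is also vgpr_size, the Vgpr tensor's element count)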
@@ -84,7 +84,8 @@ TEST_F(TestWrapperLayout, 2d)
                            ck::make_tuple(ck::Sequence<0>{}));
     const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
     const auto layout_compiletime =
-        ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
+        ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
+                                 ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));

     std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
     for(ck::index_t h = 0; h < d1; h++)
@@ -435,19 +436,11 @@ TEST(TestLayoutHelpers, ShapeAndStrides)
     constexpr bool check_compiletime_shape =
         std::is_same_v<decltype(shape_compiletime),
                        std::remove_reference_t<decltype(shape(layout_compiletime))>>;
-    constexpr bool check_compiletime_strides =
-        std::is_same_v<decltype(strides_compiletime),
-                       std::remove_reference_t<decltype(stride(layout_compiletime))>>;
     constexpr bool check_runtime_shape =
         std::is_same_v<decltype(shape_runtime),
                        std::remove_reference_t<decltype(shape(layout_runtime))>>;
-    constexpr bool check_runtime_strides =
-        std::is_same_v<decltype(strides_runtime),
-                       std::remove_reference_t<decltype(stride(layout_runtime))>>;

     EXPECT_TRUE(check_compiletime_shape);
-    EXPECT_TRUE(check_compiletime_strides);
     EXPECT_TRUE(check_runtime_shape);
-    EXPECT_TRUE(check_runtime_strides);
 }

 TEST(TestLayoutHelpers, Hierarchical)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
TEST(TestPartition, LocalPartition)
{
const auto shape =
ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
const auto strides =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
const auto layout = ck::wrapper::make_layout(shape, strides);
std::vector<ck::index_t> data(ck::wrapper::size(layout));
std::iota(data.begin(), data.end(), 0);
const auto tensor =
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
const auto thread_steps =
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{});
const auto thread_layout =
ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{});
for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
{
const auto raked_partition =
ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
const auto expected_partition_size =
ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size);
EXPECT_EQ(raked_partition(0), thread_id);
}
for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
{
const auto packed_partition =
ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps);
const auto expected_partition_size =
ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps);
EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
EXPECT_EQ(packed_partition(0), expected_partition_first_val);
}
}
TEST(TestPartition, LocalTile)
{
const auto shape =
ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
const auto strides =
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
const auto layout = ck::wrapper::make_layout(shape, strides);
std::vector<ck::index_t> data(ck::wrapper::size(layout));
std::iota(data.begin(), data.end(), 0);
const auto tensor =
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
const auto block_steps =
ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
const auto block_shape =
ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
const auto block_layout =
ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> block_idxs;
for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++)
{
for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++)
{
for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++)
{
block_idxs.emplace_back(ck::make_tuple(x, y), z);
}
}
}
for(const auto& block_idx : block_idxs)
{
const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
const auto expected_tile_size = ck::wrapper::size(block_shape);
EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size);
EXPECT_EQ(raked_tile(0), layout(block_idx));
}
for(const auto& block_idx : block_idxs)
{
const auto packed_tile =
ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps);
const auto expected_tile_size = ck::wrapper::size(block_shape);
const auto expected_tile_first_val =
ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) *
ck::wrapper::size<0, 0>(strides) +
ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) *
ck::wrapper::size<0, 1>(strides) +
ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) *
ck::wrapper::size<1>(strides);
EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
EXPECT_EQ(packed_tile(0), expected_tile_first_val);
}
}
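For reviewers checking the LocalTile expectations above: with strides ((1, 16), 64) and block_shape ((4, 2), 2), the packed tile for block index ((x, y), z) starts at x * 4 * 1 + y * 2 * 16 + z * 2 * 64, which is exactly the sum the test builds as expected_tile_first_val. One concrete case, worked from those numbers:

    // block_idx ((1, 1), 1):
    //   x term: 1 * 4 * 1  = 4
    //   y term: 1 * 2 * 16 = 32
    //   z term: 1 * 2 * 64 = 128
    //   so packed_tile(0) == 164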
@@ -108,7 +108,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
     bool* casted_success_ptr = static_cast<bool*>(success);

     const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
-    constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}));

     auto tensor_global =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
@@ -116,18 +115,18 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
     auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
                                                          nelems,
                                                          scalar_per_vector,
-                                                         ck::index_t>(register_layout);
+                                                         ck::index_t>();
     auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
                                                          nelems,
                                                          scalar_per_vector,
-                                                         ck::index_t>(register_layout);
+                                                         ck::index_t>();

     InitTensor(tensor_global);
     InitTensor(tensor_lds);
     StaticInitTensor<nelems>(tensor_vgpr);
     StaticInitTensor<nelems>(tensor_sgpr);

-    *casted_success_ptr &= TestTensorCheck1d(tensor_global);
+    *casted_success_ptr = TestTensorCheck1d(tensor_global);
     *casted_success_ptr &= TestTensorCheck3d(tensor_global);
     *casted_success_ptr &= TestTensorCheck1d(tensor_lds);
@@ -151,7 +150,7 @@ TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
                          TestTensorReadWriteDevice,
                          dim3(1),
                          dim3(1),
-                         nelems * sizeof(ck::index_t),
+                         0,
                          data_buf.GetDeviceBuffer(),
                          success_buf.GetDeviceBuffer());
@@ -173,33 +172,45 @@ TEST(TestTensor, Slicing)
     auto tensor2x2x2 =
         tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
+    EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0)));
     EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
     EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
     EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
     EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));

     auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
+    EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
     EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
     EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
     EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
-    EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor2x2));

     auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
+    EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1)));
     EXPECT_EQ(rank(tensor1x1), 2);
     EXPECT_EQ(depth(tensor1x1), 2);
     EXPECT_EQ(size(tensor1x1), 1);
-    EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor1x1));

     auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
+    EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0)));
     EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
     EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
     EXPECT_EQ(ck::wrapper::size(tensor2), 2);
-    EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor2));

+    auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2));
+    EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0)));
+    EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1);
+    EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1);
+    EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2);
+    EXPECT_TRUE(TestTensorCheck1d(tensor2_v2));

     // negative indexing
     auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
+    EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
     EXPECT_EQ(rank(tensor1x2), 2);
     EXPECT_EQ(depth(tensor1x2), 2);
     EXPECT_EQ(size(tensor1x2), 2);
-    EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor1x2));
 }