Unverified Commit 06701e70 authored by Rostyslav Geyyer's avatar Rostyslav Geyyer Committed by GitHub
Browse files

Merge branch 'develop' into lwpck-1815

parents 5800d24e da42a889
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "profiler_operation_registry.hpp"
#include <iostream>
/// Tensor layout combination selected by the "tensor layout" command-line argument.
/// The numeric values are the ones accepted on the command line.
enum class ConvLayout
{
    GNHWC_GKYXC_GNHWK = 0, // Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
    NHWGC_GKYXC_NHWGK = 1, // Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]
};
/// Output element-wise operation selected by the "element-wise operation"
/// command-line argument.
enum class OutElementOp
{
    ConvScale    = 0,
    ConvInvScale = 1,
};
/// Input/weight/output data type combination selected by the "data type"
/// command-line argument. Names read <input>_<weight>_<output>.
enum class ConvDataType
{
    F8_F8_F8   = 0, // Input fp8, Weight fp8, Output fp8
    BF8_BF8_F8 = 1, // Input bf8, Weight bf8, Output fp8
    F8_BF8_F8  = 2, // Input fp8, Weight bf8, Output fp8
    BF8_F8_F8  = 3, // Input bf8, Weight fp8, Output fp8
};
#define OP_NAME "grouped_conv_fwd_outelementop"
#define OP_DESC "Grouped Convolution Forward+Elementwise Operation"
static void print_helper_msg()
{
// clang-format off
std::cout
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp8, Weight fp8, Output fp8\n"
<< " 1: Input bf8, Weight bf8, Output fp8\n"
<< " 2: Input fp8, Weight bf8, Output fp8\n"
<< " 3: Input bf8, Weight fp8, Output fp8)\n"
<< "arg3: element-wise operation (0: ConvScale\n"
<< " 1: ConvInvScale)\n"
<< "arg4: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
<< " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
<< "arg5: verification (0: no, 1: yes)\n"
<< "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg7: print tensor value (0: no; 1: yes)\n"
<< "arg8: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
// clang-format on
}
int grouped_conv_fwd_outelementop(int argc, char* argv[])
{
// 9 total, 1 for num_dim_spatial
if(argc < 10)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto op = static_cast<OutElementOp>(std::stoi(argv[3]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[4]));
const bool do_verification = std::stoi(argv[5]);
const int init_method = std::stoi(argv[6]);
const bool do_log = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[8]);
const int num_dim_spatial = std::stoi(argv[9]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + 1 for argv[0]
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1)
{
print_helper_msg();
return 1;
}
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);
using F8 = ck::f8_t;
using BF8 = ck::bf8_t;
using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
using ConvScale = ck::tensor_operation::element_wise::ConvScale;
using ConvInvScale = ck::tensor_operation::element_wise::ConvInvscale;
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp,
auto in_layout,
auto wei_layout,
auto out_layout,
auto in_type,
auto wei_type,
auto out_type,
auto out_element_op,
auto a_compute_type,
auto b_compute_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using WeiLayout = decltype(wei_layout);
using OutLayout = decltype(out_layout);
using InDataType = decltype(in_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
using OutElementOp = decltype(out_element_op);
using AComputeType = decltype(a_compute_type);
using BComputeType = decltype(b_compute_type);
bool pass = ck::profiler::profile_grouped_conv_fwd_outelementop_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
OutElementOp,
AComputeType,
BComputeType>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
};
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(op == OutElementOp::ConvScale)
{
if(data_type == ConvDataType::F8_F8_F8)
{
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, F8{}, F8{}, ConvScale{}, F8{}, F8{});
}
else if(data_type == ConvDataType::BF8_BF8_F8)
{
return profile(I3,
NDHWGC{},
GKZYXC{},
NDHWGK{},
BF8{},
BF8{},
F8{},
ConvScale{},
BF8{},
BF8{});
}
else if(data_type == ConvDataType::F8_BF8_F8)
{
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, BF8{}, F8{}, ConvScale{}, F8{}, BF8{});
}
else if(data_type == ConvDataType::BF8_F8_F8)
{
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, ConvScale{}, BF8{}, F8{});
}
}
else if(op == OutElementOp::ConvInvScale)
{
if(data_type == ConvDataType::F8_F8_F8)
{
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, F8{}, F8{}, ConvInvScale{}, F8{}, F8{});
}
else if(data_type == ConvDataType::BF8_BF8_F8)
{
return profile(I3,
NDHWGC{},
GKZYXC{},
NDHWGK{},
BF8{},
BF8{},
F8{},
ConvInvScale{},
BF8{},
BF8{});
}
else if(data_type == ConvDataType::F8_BF8_F8)
{
return profile(I3,
NDHWGC{},
GKZYXC{},
NDHWGK{},
F8{},
BF8{},
F8{},
ConvInvScale{},
F8{},
BF8{});
}
else if(data_type == ConvDataType::BF8_F8_F8)
{
return profile(I3,
NDHWGC{},
GKZYXC{},
NDHWGK{},
BF8{},
F8{},
F8{},
ConvInvScale{},
BF8{},
F8{});
}
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, grouped_conv_fwd_outelementop);
...@@ -98,8 +98,8 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -98,8 +98,8 @@ int profile_grouped_gemm(int argc, char* argv[])
int n_iter = 10; int n_iter = 10;
if(argc == 17) if(argc == 17)
{ {
n_warmup = std::stoi(argv[16]); n_warmup = std::stoi(argv[15]);
n_iter = std::stoi(argv[17]); n_iter = std::stoi(argv[16]);
} }
#ifdef CK_ENABLE_FP16 #ifdef CK_ENABLE_FP16
......
File mode changed from 100755 to 100644
#!/bin/bash
# Runs the ckProfiler grouped-convolution-forward + output-element-op benchmark
# for two fixed 3D problem sizes (K = C = 96 and K = C = 192); the batch size N
# and all profiler control flags come from the command line.
#
# Usage: <script> OP DATATYPE OUTELEMENTOP LAYOUT VERIFY INIT LOG TIME N

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

# All nine positional arguments are required; fail early instead of launching
# the profiler twice with a truncated argument list.
if [ "$#" -lt 9 ]; then
    echo "Usage: $0 OP DATATYPE OUTELEMENTOP LAYOUT VERIFY INIT LOG TIME N" >&2
    exit 1
fi

OP=$1
DATATYPE=$2
OUTELEMENTOP=$3
LAYOUT=$4
VERIFY=$5
INIT=$6
LOG=$7
TIME=$8
N=$9

# Expansions are quoted so that each value reaches the driver as exactly one argument.
####### op datatype OUTELEMENTOP layout verify init log time Ndims G N K C Z Y X Di Hi Wi Sz Sy Sx Dz Dy Dx LeftPz LeftPy LeftPx RightPz RightPy RightPx
"$DRIVER" "$OP" "$DATATYPE" "$OUTELEMENTOP" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$TIME" 3 32 "$N" 96 96 3 3 3 28 28 28 1 1 1 1 1 1 1 1 1 1 1 1
"$DRIVER" "$OP" "$DATATYPE" "$OUTELEMENTOP" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$TIME" 3 32 "$N" 192 192 3 3 3 28 28 28 1 1 1 1 1 1 1 1 1 1 1 1
...@@ -60,7 +60,7 @@ function(add_test_executable TEST_NAME) ...@@ -60,7 +60,7 @@ function(add_test_executable TEST_NAME)
endif() endif()
endforeach() endforeach()
foreach(source IN LISTS ARGN) foreach(source IN LISTS ARGN)
if(NOT TEST_TARGETS MATCHES "gfx11" AND source MATCHES "wmma") if(NOT TEST_TARGETS MATCHES "gfx11" AND NOT TEST_TARGETS MATCHES "gfx12" AND source MATCHES "wmma")
message("removing wmma test ${source} ") message("removing wmma test ${source} ")
list(REMOVE_ITEM ARGN "${source}") list(REMOVE_ITEM ARGN "${source}")
endif() endif()
...@@ -71,6 +71,8 @@ function(add_test_executable TEST_NAME) ...@@ -71,6 +71,8 @@ function(add_test_executable TEST_NAME)
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103) list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
elseif(ARGN MATCHES "_wmma") elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "_smfmac")
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
endif() endif()
set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_executable(${TEST_NAME} ${ARGN}) add_executable(${TEST_NAME} ${ARGN})
...@@ -139,7 +141,7 @@ function(add_gtest_executable TEST_NAME) ...@@ -139,7 +141,7 @@ function(add_gtest_executable TEST_NAME)
endif() endif()
endforeach() endforeach()
foreach(source IN LISTS ARGN) foreach(source IN LISTS ARGN)
if(NOT TEST_TARGETS MATCHES "gfx11" AND source MATCHES "wmma") if(NOT TEST_TARGETS MATCHES "gfx11" AND NOT TEST_TARGETS MATCHES "gfx12" AND source MATCHES "wmma")
message("removing wmma test ${source} ") message("removing wmma test ${source} ")
list(REMOVE_ITEM ARGN "${source}") list(REMOVE_ITEM ARGN "${source}")
endif() endif()
...@@ -150,6 +152,8 @@ function(add_gtest_executable TEST_NAME) ...@@ -150,6 +152,8 @@ function(add_gtest_executable TEST_NAME)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103) list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
elseif(ARGN MATCHES "_wmma") elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "_smfmac")
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
endif() endif()
set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_executable(${TEST_NAME} ${ARGN}) add_executable(${TEST_NAME} ${ARGN})
...@@ -209,4 +213,7 @@ add_subdirectory(wrapper) ...@@ -209,4 +213,7 @@ add_subdirectory(wrapper)
if(GPU_TARGETS MATCHES "gfx11") if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op) add_subdirectory(wmma_op)
endif() endif()
# smfmac needs ROCm 6.2 or newer. Major and minor are compared as a pair:
# testing "MAJOR >= 6 AND MINOR >= 2" independently would wrongly reject
# any later major release whose minor version is below 2 (e.g. 7.0).
if(GPU_TARGETS MATCHES "gfx942" AND
   (CK_HIP_VERSION_MAJOR GREATER 6 OR
    (CK_HIP_VERSION_MAJOR EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2)))
    add_subdirectory(smfmac_op)
endif()
add_subdirectory(position_embedding) add_subdirectory(position_embedding)
...@@ -2,11 +2,11 @@ add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_x ...@@ -2,11 +2,11 @@ add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_x
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
endif() endif()
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_xdl.cpp) add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp)
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data_interface_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance)
endif() endif()
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_wmma.cpp) add_gtest_executable(test_grouped_convnd_bwd_data_interface_wmma test_grouped_convnd_bwd_data_interface_wmma.cpp)
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data_interface_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance)
endif() endif()
...@@ -52,6 +52,14 @@ class TestGroupedConvndBwdData : public ::testing::Test ...@@ -52,6 +52,14 @@ class TestGroupedConvndBwdData : public ::testing::Test
ck::utils::conv::ConvParam conv_param; ck::utils::conv::ConvParam conv_param;
// Per-test setup: the test is skipped unless ck::is_gfx11_supported() reports true.
void SetUp() override
{
    const bool device_supported = ck::is_gfx11_supported();
    if(device_supported)
    {
        return;
    }
    GTEST_SKIP();
}
template <ck::index_t NDimSpatial> template <ck::index_t NDimSpatial>
bool Run() bool Run()
{ {
......
...@@ -5,13 +5,13 @@ if(GPU_TARGETS MATCHES "gfx9" OR DL_KERNELS) ...@@ -5,13 +5,13 @@ if(GPU_TARGETS MATCHES "gfx9" OR DL_KERNELS)
add_gtest_executable(test_grouped_convnd_bwd_weight test_grouped_convnd_bwd_weight.cpp) add_gtest_executable(test_grouped_convnd_bwd_weight test_grouped_convnd_bwd_weight.cpp)
target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv3d_bwd_weight_instance) target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv3d_bwd_weight_instance)
endif() endif()
add_gtest_executable(test_grouped_convnd_bwd_weight_interface test_grouped_convnd_bwd_weight_interface_xdl.cpp) add_gtest_executable(test_grouped_convnd_bwd_weight_interface_xdl test_grouped_convnd_bwd_weight_interface_xdl.cpp)
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_weight_interface PRIVATE utility) target_link_libraries(test_grouped_convnd_bwd_weight_interface_xdl PRIVATE utility)
endif() endif()
add_gtest_executable(test_grouped_convnd_bwd_weight_interface test_grouped_convnd_bwd_weight_interface_wmma.cpp) add_gtest_executable(test_grouped_convnd_bwd_weight_interface_wmma test_grouped_convnd_bwd_weight_interface_wmma.cpp)
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_weight_interface PRIVATE utility) target_link_libraries(test_grouped_convnd_bwd_weight_interface_wmma PRIVATE utility)
endif() endif()
add_gtest_executable(test_grouped_conv_bwd_weight_xdl_bilinear test_grouped_conv_bwd_weight_xdl_bilinear.cpp) add_gtest_executable(test_grouped_conv_bwd_weight_xdl_bilinear test_grouped_conv_bwd_weight_xdl_bilinear.cpp)
if(result EQUAL 0) if(result EQUAL 0)
......
...@@ -44,7 +44,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test ...@@ -44,7 +44,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
} }
} }
if(ck::is_gfx11_supported()) if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
{ {
// on gfx11 only support for 3d is implemented // on gfx11 only support for 3d is implemented
if constexpr(NDimSpatial{} != 3) if constexpr(NDimSpatial{} != 3)
......
...@@ -52,6 +52,14 @@ class TestGroupedConvndBwdWeight : public ::testing::Test ...@@ -52,6 +52,14 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
ck::utils::conv::ConvParam conv_param; ck::utils::conv::ConvParam conv_param;
// Per-test setup: the test is skipped unless ck::is_gfx11_supported() reports true.
void SetUp() override
{
    const bool device_supported = ck::is_gfx11_supported();
    if(device_supported)
    {
        return;
    }
    GTEST_SKIP();
}
template <ck::index_t SplitK> template <ck::index_t SplitK>
bool Run() bool Run()
{ {
......
if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11") if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11")
add_gtest_executable(test_grouped_convnd_fwd test_grouped_convnd_fwd.cpp) add_gtest_executable(test_grouped_convnd_fwd test_grouped_convnd_fwd.cpp)
if(GPU_TARGETS MATCHES "gfx11") if((GPU_TARGETS MATCHES "gfx11") AND (NOT GPU_TARGETS MATCHES "gfx9"))
target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance) target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
else() else()
target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance) target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
......
add_gtest_executable(test_smfmac_op smfmac_op_xdl.cpp)
target_link_libraries(test_smfmac_op PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "test/smfmac_op/smfmac_op_util.hpp"
template <typename Src1Type,
ck::index_t Src1VecSize,
typename Src2Type,
ck::index_t Src2VecSize,
typename DstType,
ck::index_t AccVecSize,
typename GPUAccType,
typename CPUAccType,
ck::index_t M,
ck::index_t N,
ck::index_t K>
bool run_test()
{
using Row = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
bool pass = true;
const auto matmul_default = ck::smfmac_op_util::matmul<Src1Type,
Src1VecSize,
Src2Type,
Src2VecSize,
GPUAccType,
AccVecSize,
DstType,
M,
N,
K>;
const auto smfmac_kernel_container = std::make_tuple(matmul_default);
ck::static_for<0, 1, 1>{}([&](auto i) {
pass &=
ck::smfmac_op_util::TestSmfmac<decltype(std::get<ck::Number<i>{}>(
smfmac_kernel_container)),
Src1Type,
Src2Type,
DstType,
GPUAccType,
CPUAccType,
decltype(Row{}),
decltype(Row{}),
decltype(Row{}),
PassThrough,
PassThrough,
PassThrough,
AccVecSize,
M,
N,
K>{}(std::get<ck::Number<i>{}>(smfmac_kernel_container));
});
return pass;
}
/// Runs every smfmac test configuration and prints the combined verdict.
///
/// @return process exit status: 0 when all configurations pass, 1 otherwise
int main(int, char*[])
{
    bool pass = true;
    // clang-format off
    //              |   Src1Type| Src1VecSize|    Src2Type| Src2VecSize| DstType| DstVecSize| GPUAccType| CPUAccType|  M|  N|  K|
    pass &= run_test< ck::half_t,           4,  ck::half_t,           8,   float,          4,      float,      float, 16, 16, 32>();
    pass &= run_test<ck::bhalf_t,           4, ck::bhalf_t,           8,   float,          4,      float,      float, 16, 16, 32>();
    pass &= run_test< ck::half_t,           4,  ck::half_t,           8,   float,         16,      float,      float, 32, 32, 16>();
    pass &= run_test<ck::bhalf_t,           4, ck::bhalf_t,           8,   float,         16,      float,      float, 32, 32, 16>();
    // clang-format on

    std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;

    // An exit status of 0 signals success to the caller (shell, ctest). Returning the
    // bool directly would report 1 on success and 0 on failure.
    return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/utility/amd_smfmac.hpp"
#include "ck/library/utility/fill.hpp"
namespace ck {
namespace smfmac_op_util {
/// Selects the smfmac intrinsic wrapper matching the operand vector types and
/// the accumulator vector type.
///
/// Primary template: intentionally empty, so an operand/accumulator combination
/// without a specialization below performs no computation.
template <typename src_vec1, typename src_vec2, typename acc_vec>
__device__ void
builtin_smfmac_naive_selector(const src_vec1&, const src_vec2&, const int32_t&, acc_vec&)
{
}

/// half operands with a 4-float accumulator: intrin_smfmac_f32_16x16x32f16<16, 16>.
template <>
__device__ void
builtin_smfmac_naive_selector<half4_t,
                              half8_t,
                              StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>>(
    const half4_t& reg_a,
    const half8_t& reg_b,
    const int32_t& reg_idx,
    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>& reg_c)
{
    // The accumulator buffer holds a single vector; hand that vector to the intrinsic.
    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
    intrin_smfmac_f32_16x16x32f16<16, 16>::Run(reg_a, reg_b, reg_idx, acc);
}

/// bhalf operands with a 4-float accumulator: intrin_smfmac_f32_16x16x32bf16<16, 16>.
template <>
__device__ void
builtin_smfmac_naive_selector<bhalf4_t,
                              bhalf8_t,
                              StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>>(
    const bhalf4_t& reg_a,
    const bhalf8_t& reg_b,
    const int32_t& reg_idx,
    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 4, true>& reg_c)
{
    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
    intrin_smfmac_f32_16x16x32bf16<16, 16>::Run(reg_a, reg_b, reg_idx, acc);
}

/// half operands with a 16-float accumulator: intrin_smfmac_f32_32x32x16f16<32, 32>.
template <>
__device__ void builtin_smfmac_naive_selector<
    half4_t,
    half8_t,
    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>>(
    const half4_t& reg_a,
    const half8_t& reg_b,
    const int32_t& reg_idx,
    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>& reg_c)
{
    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
    intrin_smfmac_f32_32x32x16f16<32, 32>::Run(reg_a, reg_b, reg_idx, acc);
}

/// bhalf operands with a 16-float accumulator: intrin_smfmac_f32_32x32x16bf16<32, 32>.
template <>
__device__ void builtin_smfmac_naive_selector<
    bhalf4_t,
    bhalf8_t,
    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>>(
    const bhalf4_t& reg_a,
    const bhalf8_t& reg_b,
    const int32_t& reg_idx,
    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, float, 1, 16, true>& reg_c)
{
    auto& acc = reg_c.GetVectorTypeReference(Number<0>{});
    intrin_smfmac_f32_32x32x16bf16<32, 32>::Run(reg_a, reg_b, reg_idx, acc);
}
// Smfmac instructions are using 4:2 structural sparsity, that means that in every contiguous
// subgroup of 4 elements, at least 2 must be equal to zero and the position of non-zero elements is
// stored in idx register to allow selection of corresponding B matrix elements for multiplication.
// Currently smfmac instructions support only A matrix as sparse
//
// Test kernel: computes c = a * b with a single smfmac instruction chosen by
// builtin_smfmac_naive_selector.
//   a: M x K, indexed as a[row * K + col]
//   b: K x N, indexed as b[row * N + col]
//   c: M x N, indexed as c[row * N + col]
// Every thread handles exactly 8 elements of a and 8 elements of b (the loops below are
// hard-coded to 8).
// NOTE(review): the index arithmetic appears to assume one block of 64 threads, which is how
// RunDeviceGEMM in this file launches it — confirm before reusing elsewhere.
template <typename src1_t,
          index_t src1_vec_size,
          typename src2_t,
          index_t src2_vec_size,
          typename acc_t,
          index_t acc_vec_size,
          typename dst_t,
          int32_t M,
          int32_t N,
          int32_t K>
__global__ void matmul(const src1_t* a, const src2_t* b, dst_t* c)
{
    // Shared-memory staging copies of the full A and B matrices.
    __shared__ src1_t a_shared[M * K];
    __shared__ src2_t b_shared[K * N];
    const int lane = threadIdx.x;
    // smfmac's A part is storing only non-zero elements in 2VGPRs
    // smfmac's B part is storing all elements in 4VGPRs
    using src1_vec      = typename vector_type<src1_t, src1_vec_size>::type;
    using src1_full_vec = typename vector_type<src1_t, src1_vec_size * 2>::type;
    using src2_vec      = typename vector_type<src2_t, src2_vec_size>::type;
    src1_vec a_frag      = {};
    src2_vec b_frag      = {};
    src1_full_vec a_temp = {};
    src2_vec b_temp      = {};
    // c fragment: default-constructed here and never written before the smfmac call.
    // NOTE(review): presumably the default constructor zero-initializes it — confirm.
    using acc_vec = StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, acc_t, 1, acc_vec_size, true>;
    acc_vec c_thread_buf_;
    // Load this lane's 8 consecutive elements of A: row (lane % M), columns (lane / M) * 8 + i.
    for(int i = 0; i < 8; ++i)
    {
        a_temp[i] = a[(lane % M) * K + (lane / M) * 8 + i]; // M K
    }
    // Load this lane's 8 elements of B: column (lane % N), rows 8 * (lane / N) + i.
    for(int i = 0; i < 8; ++i)
    {
        b_temp[i] = b[(8 * (lane / N) + i) * N + (lane % N)]; // K N
    }
    __syncthreads();
    // Write the loaded elements to shared memory at the same positions they were read from.
    for(int i = 0; i < 8; ++i)
    {
        a_shared[(lane % M) * K + (lane / M) * 8 + i] = a_temp[i];
    }
    for(int i = 0; i < 8; ++i)
    {
        b_shared[(8 * (lane / N) + i) * N + (lane % N)] = b_temp[i];
    }
    __syncthreads();
    // Idx must be a 32-bit register and it is storing 4 2-bit indexes of A's non zero elements.
    // It starts with last two elements of every 4 elements subgroup set as non-zero
    int32_t idx = 0b11101110;
    // Bit masks are for zeroing 0-3rd position of idx
    static constexpr int32_t bit_clear_masks[4] = {0b11, 0b1100, 0b110000, 0b11000000};
    src1_t curr_val;
    int32_t a_pos = 0;
    // Compress this lane's two 4-element groups of A (j = 0, 1): each non-zero value is
    // appended to a_frag and its position within the group (0..3) is written into the
    // matching 2-bit field of idx.
    // NOTE(review): a_pos starts at 2 * j and advances once per non-zero value, so a group
    // with more than two non-zero values would make the second group index past
    // bit_clear_masks[3]; the input must already satisfy the sparsity constraint above.
    for(int j = 0; j < 2; ++j)
    {
        a_pos = j * 2;
        for(int i = 0; i < 4; ++i)
        {
            curr_val = a_shared[(lane % M) * K + (lane / M) * 8 + 4 * j + i];
            if(curr_val != 0.0f)
            {
                idx &= ~bit_clear_masks[a_pos];
                idx |= (i % 4) << 2 * a_pos;
                a_frag[a_pos] = curr_val;
                a_pos++;
            }
        }
    }
    // B is passed dense: all 8 of this lane's elements go into b_frag.
    for(int i = 0; i < 8; ++i)
    {
        b_frag[i] = b_shared[(8 * (lane / N) + i) * N + (lane % N)];
    }
    builtin_smfmac_naive_selector<src1_vec, src2_vec, acc_vec>(a_frag, b_frag, idx, c_thread_buf_);
    __syncthreads();
    // store results from unpacked c_thread_buf_ output
    if constexpr(K == 32)
    {
        // Accumulator element i of this lane goes to row 4 * (lane / 16) + i, column lane % 16.
        static_for<0, acc_vec_size, 1>{}([&](auto i) {
            c[(4 * (lane / 16) + i) * N + lane % 16] =
                ck::type_convert<dst_t>(c_thread_buf_[Number<i>{}]);
        });
    }
    else
    {
        // Accumulator element i of this lane goes to row
        // (8 * (i / 4)) % 32 + 4 * (lane / 32) + i % 4, column lane % 32.
        static_for<0, acc_vec_size, 1>{}([&](auto i) {
            c[((8 * (i / 4)) % 32 + 4 * (lane / 32) + i % 4) * N + lane % 32] =
                ck::type_convert<dst_t>(c_thread_buf_[Number<i>{}]);
        });
    }
}
/// GEMM problem description used by the smfmac tests.
/// A default-constructed instance describes a 16 x 16 x 32 problem with
/// strides 32 / 16 / 16, alpha = 1 and beta = 0.
struct GemmParams
{
    // The defaults live in the member initializers below.
    GemmParams() {}

    ck::index_t M = 16;
    ck::index_t N = 16;
    ck::index_t K = 32;

    ck::index_t StrideA = 32;
    ck::index_t StrideB = 16;
    ck::index_t StrideC = 16;

    float alpha = 1;
    float beta  = 0;
};
/// Runs the host reference GEMM described by GemmInstance.
///
/// @param A,B          input tensors
/// @param C            tensor handed to the reference GEMM as its output argument
/// @param a_element_op,b_element_op,c_element_op element-wise operations forwarded to the
///                     reference GEMM
template <typename GemmInstance,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunHostGEMM(const Tensor<ADataType>& A,
                 const Tensor<BDataType>& B,
                 Tensor<CDataType>& C,
                 AElementwiseOperation a_element_op,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op)
{
    GemmInstance reference_gemm{};

    auto invoker  = reference_gemm.MakeInvoker();
    auto argument = reference_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);

    invoker.Run(argument);
}
template <typename KernelType, typename ADataType, typename BDataType, typename CDataType>
bool RunDeviceGEMM(KernelType kernel,
const Tensor<ADataType>& A,
const Tensor<BDataType>& B,
Tensor<CDataType>& C)
{
DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize());
DeviceMem b_n_k_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(A.mData.data());
b_n_k_device_buf.ToDevice(B.mData.data());
kernel<<<1, 64>>>(static_cast<const ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<const BDataType*>(b_n_k_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()));
c_m_n_device_buf.FromDevice(C.mData.data());
return true;
}
/// Test functor: builds random input tensors (A made structurally sparse), runs the
/// host reference GEMM and the device smfmac kernel, and compares the two results.
template <typename DeviceSmfmac,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename GPUAccDataType,
          typename CPUAccDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
          index_t CAccNum,
          index_t M,
          index_t N,
          index_t K>
struct TestSmfmac
{
    /// Creates the host tensors for one GEMM problem.
    ///
    /// @param params problem sizes and strides
    /// @return tuple of (A, B, host-result C, device-result C); A and B are filled by
    ///         GeneratorTensor_2{-5, 5} and A is then passed through
    ///         TransformIntoStructuralSparsity
    auto PrepareGemmTensor(const ck::smfmac_op_util::GemmParams& params)
    {
        // Row-major tensors use strides {ld, 1}; any other layout uses {1, ld}.
        const auto make_descriptor =
            [](std::size_t rows, std::size_t cols, std::size_t ld, auto layout) {
                using Layout = decltype(layout);
                if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({rows, cols}),
                                                std::vector<std::size_t>({ld, 1}));
                }
                return HostTensorDescriptor(std::vector<std::size_t>({rows, cols}),
                                            std::vector<std::size_t>({1, ld}));
            };

        Tensor<ADataType> a_m_k(make_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BDataType> b_k_n(make_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<CDataType> c_host_result(
            make_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<CDataType> c_device_result(
            make_descriptor(params.M, params.N, params.StrideC, CLayout{}));

        // Fill A first, then B (same order as the generator calls were always issued in).
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});

        ck::utils::TransformIntoStructuralSparsity<ADataType>{}(a_m_k);

        return std::make_tuple(a_m_k, b_k_n, c_host_result, c_device_result);
    }

    /// Runs the comparison for one kernel.
    ///
    /// @param smfmac_kernel device kernel to launch
    /// @return true when the device result matches the host reference (or when the device
    ///         run reports itself as unsupported); false on mismatch or when CDataType is
    ///         not float
    auto operator()(const DeviceSmfmac& smfmac_kernel)
    {
        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
                  << ", CLayout = " << CLayout{}.name << std::endl;

        // Arrange
        ck::smfmac_op_util::GemmParams problem;
        problem.M       = M;
        problem.N       = N;
        problem.K       = K;
        problem.StrideA = K; // M K
        problem.StrideB = N; // K N
        problem.StrideC = N; // M N

        auto tensors = PrepareGemmTensor(problem);

        const Tensor<ADataType>& a  = std::get<0>(tensors);
        const Tensor<BDataType>& b  = std::get<1>(tensors);
        Tensor<CDataType>& c_host   = std::get<2>(tensors);
        Tensor<CDataType>& c_device = std::get<3>(tensors);

        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                      BDataType,
                                                      CDataType,
                                                      CPUAccDataType,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;

        ck::smfmac_op_util::RunHostGEMM<ReferenceGemmInstance>(a,
                                                               b,
                                                               c_host,
                                                               AElementwiseOperation{},
                                                               BElementwiseOperation{},
                                                               CElementwiseOperation{});

        // Act
        const bool ran_on_device =
            ck::smfmac_op_util::RunDeviceGEMM(smfmac_kernel, a, b, c_device);
        if(!ran_on_device)
        {
            return true;
        }

        // Assert
        if(!std::is_same<CDataType, float>::value)
        {
            std::cout << "UNSUPPORTED CDataType" << std::endl;
            return false;
        }

        const bool matches = ck::utils::check_err(c_device.mData, c_host.mData);
        std::cout << (matches ? "SUCCESS" : "FAILURE") << std::endl;
        return matches;
    }
};
} // namespace smfmac_op_util
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "test/smfmac_op/smfmac_op_util.hpp"
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
/// Typed gtest fixture for the smfmac matmul kernel.
///
/// `Tuple` carries one test configuration, in this order:
/// <Src1Type, Src1VecSize, Src2Type, Src2VecSize, DstType, AccVecSize,
///  GPUAccType, CPUAccType, M, N, K>; the integer entries are compile-time
/// constant types exposing `.value`.
template <typename Tuple>
class TestSmfmac : public ::testing::Test
{
    protected:
    using Src1Type   = std::tuple_element_t<0, Tuple>;
    using Src2Type   = std::tuple_element_t<2, Tuple>;
    using DstType    = std::tuple_element_t<4, Tuple>;
    using GPUAccType = std::tuple_element_t<6, Tuple>;
    using CPUAccType = std::tuple_element_t<7, Tuple>;

    static constexpr ck::index_t Src1VecSize = std::tuple_element_t<1, Tuple>{}.value;
    static constexpr ck::index_t Src2VecSize = std::tuple_element_t<3, Tuple>{}.value;
    static constexpr ck::index_t AccVecSize  = std::tuple_element_t<5, Tuple>{}.value;

    static constexpr ck::index_t M = std::tuple_element_t<8, Tuple>{}.value;
    static constexpr ck::index_t N = std::tuple_element_t<9, Tuple>{}.value;
    static constexpr ck::index_t K = std::tuple_element_t<10, Tuple>{}.value;

    /// Runs every kernel in the container through ck::smfmac_op_util::TestSmfmac
    /// (row-major A/B/C, pass-through element-wise ops) and expects all to pass.
    void Run()
    {
        constexpr auto kernel  = ck::smfmac_op_util::matmul<Src1Type,
                                                           Src1VecSize,
                                                           Src2Type,
                                                           Src2VecSize,
                                                           GPUAccType,
                                                           AccVecSize,
                                                           DstType,
                                                           M,
                                                           N,
                                                           K>;
        constexpr auto kernels = std::make_tuple(kernel);
        using KernelTuple      = decltype(kernels);

        bool all_passed = true;

        ck::static_for<0, std::tuple_size_v<KernelTuple>, 1>{}([&](auto kernel_idx) {
            using Tester =
                ck::smfmac_op_util::TestSmfmac<std::tuple_element_t<kernel_idx.value, KernelTuple>,
                                               Src1Type,
                                               Src2Type,
                                               DstType,
                                               GPUAccType,
                                               CPUAccType,
                                               Row,
                                               Row,
                                               Row,
                                               PassThrough,
                                               PassThrough,
                                               PassThrough,
                                               AccVecSize,
                                               M,
                                               N,
                                               K>;

            all_passed &= Tester{}(std::get<ck::Number<kernel_idx>{}>(kernels));
        });

        EXPECT_TRUE(all_passed);
    }
};
// Shorthand for a compile-time integer constant type.
template <ck::index_t N>
using I = ck::Number<N>;
// Test configurations. Each tuple is read by the TestSmfmac fixture as
// <Src1Type, Src1VecSize, Src2Type, Src2VecSize, DstType, AccVecSize,
//  GPUAccType, CPUAccType, M, N, K>.
using KernelTypes =
    ::testing::Types<std::tuple<F16, I<4>, F16, I<8>, F32, I<4>, F32, F32, I<16>, I<16>, I<32>>,
                     std::tuple<BF16, I<4>, BF16, I<8>, F32, I<4>, F32, F32, I<16>, I<16>, I<32>>,
                     std::tuple<F16, I<4>, F16, I<8>, F32, I<16>, F32, F32, I<32>, I<32>, I<16>>,
                     std::tuple<BF16, I<4>, BF16, I<8>, F32, I<16>, F32, F32, I<32>, I<32>, I<16>>>;
// Instantiate the fixture once per configuration above.
TYPED_TEST_SUITE(TestSmfmac, KernelTypes);
// Single test body: delegates to the fixture's Run().
TYPED_TEST(TestSmfmac, TestSmfmacFP16BF16) { this->Run(); }
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/utility/amd_wmma.hpp" #include "ck/utility/amd_wmma.hpp"
#include "ck/host_utility/device_prop.hpp"
namespace ck { namespace ck {
namespace wmma_op_util { namespace wmma_op_util {
...@@ -140,10 +141,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) ...@@ -140,10 +141,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
p_shared[8 * 16 * lane_hi + 8 * lane_lo + ele + 16 * 16] = b_temp[ele]; p_shared[8 * 16 * lane_hi + 8 * lane_lo + ele + 16 * 16] = b_temp[ele];
} }
#ifdef __gfx12__
asm volatile("\
s_wait_dscnt 0x0 \n \
s_barrier_signal -1 \n \
s_barrier_wait -1 \
" ::);
#else
asm volatile("\ asm volatile("\
s_waitcnt lgkmcnt(0) \n \ s_waitcnt lgkmcnt(0) \n \
s_barrier \ s_barrier \
" ::); " ::);
#endif
for(int ele = 0; ele < 16; ++ele) for(int ele = 0; ele < 16; ++ele)
{ {
...@@ -155,10 +164,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) ...@@ -155,10 +164,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
a_frag[ele] = p_shared[(ele / 8) * 16 * 8 + 8 * lane + ele % 8]; a_frag[ele] = p_shared[(ele / 8) * 16 * 8 + 8 * lane + ele % 8];
} }
#ifdef __gfx12__
asm volatile("\
s_wait_dscnt 0x0 \n \
s_barrier_signal -1 \n \
s_barrier_wait -1 \
" ::);
#else
asm volatile("\ asm volatile("\
s_waitcnt lgkmcnt(0) \n \ s_waitcnt lgkmcnt(0) \n \
s_barrier \ s_barrier \
" ::); " ::);
#endif
// sync threads, similar to mma_sync // sync threads, similar to mma_sync
// __syncthreads(); // __syncthreads();
...@@ -357,7 +374,8 @@ struct TestWmma ...@@ -357,7 +374,8 @@ struct TestWmma
a, b, c_host, a_element_op, b_element_op, c_element_op); a, b, c_host, a_element_op, b_element_op, c_element_op);
// Act // Act
bool is_supported = ck::wmma_op_util::RunDeviceGEMM(wmma_kernel, a, b, c_device); bool is_supported = ck::is_gfx11_supported() &&
ck::wmma_op_util::RunDeviceGEMM(wmma_kernel, a, b, c_device);
if(is_supported) if(is_supported)
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment