"docs/source/vscode:/vscode.git/clone" did not exist on "3f7c3511dcc95e5bb9fd53399dfc4eb655e1d6fd"
Commit ed528d76 authored by Aleksander Dudek's avatar Aleksander Dudek
Browse files

Merge branch 'develop' into ck_tile_gemmkernel_reuse

parents 6e078dc0 355893cd
 FROM ubuntu:20.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.2
+ARG ROCMVERSION=6.3
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""

@@ -13,17 +13,12 @@ RUN set -xe && \
     apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
     curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg

-RUN if [ "$ROCMVERSION" != "6.3" ]; then \
-    sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \
-    apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
+RUN if [ "$ROCMVERSION" != "6.4" ]; then \
+    sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \
+    apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
     wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
     sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
     sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
-    sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3-20.04-1_all.deb --no-check-certificate" && \
-    apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3-20.04-1_all.deb && \
-    sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3 rel-20 > /etc/apt/sources.list.d/rocm-build.list' && \
-    amdgpu-repo --amdgpu-build=2074281; \
     fi
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
...

-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.2"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub20.04_rocm6.3"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
...
@@ -172,12 +172,13 @@ bool run_grouped_conv_fwd(bool do_verification,
     {
     case 0: break;
     case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        // values generated: -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 6});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
...
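Note on the change above: init_method == 1 now draws input values from the half-open integer range {-5, 6} (i.e. -5 through 5, as the new comment spells out) and switches the weights to floating-point initialization in [-1.0, 1.0]. A minimal sketch of what such generators could look like (hypothetical stand-ins; CK's actual GeneratorTensor_2 and GeneratorTensor_3 may differ in detail):

#include <cstdlib>

// Hypothetical integer generator: yields values in [min_value, max_value),
// so {-5, 6} produces -5, -4, ..., 5.
template <typename T>
struct GeneratorTensor_2_sketch
{
    int min_value; // inclusive
    int max_value; // exclusive

    template <typename... Is>
    T operator()(Is...) const
    {
        return static_cast<T>(min_value + std::rand() % (max_value - min_value));
    }
};

// Hypothetical float generator: yields values spread over [min_value, max_value].
template <typename T>
struct GeneratorTensor_3_sketch
{
    float min_value;
    float max_value;

    template <typename... Is>
    T operator()(Is...) const
    {
        float t = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
        return static_cast<T>(min_value + t * (max_value - min_value));
    }
};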
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
     static constexpr auto I3 = Number<3>{};
     static constexpr index_t KPerBlock = K0PerBlock * K1;

-    static constexpr auto transform_conv_to_gemm =
-        TransformConvBwdDataToGemm_v1<NDimSpatial,
-                                      ConvBackwardDataSpecialization,
-                                      K1,
-                                      K1,
-                                      MPerBlock,
-                                      NPerBlock,
-                                      KPerBlock,
-                                      true /* DoPadGemmM */,
-                                      true /* DoPadGemmN */>{};
+    using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                                     ConvBackwardDataSpecialization,
+                                                                     K1,
+                                                                     K1,
+                                                                     MPerBlock,
+                                                                     NPerBlock,
+                                                                     KPerBlock,
+                                                                     true /* DoPadGemmM */,
+                                                                     true /* DoPadGemmN */,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     ELayout>;

-    static auto GetDummyABDsEGridDescriptor()
-    {
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
-        const std::array<index_t, NDimSpatial> dummy_spatial_lengths = {1};
-
-        const auto a_grid_desc_ak0_m_ak1 =
-            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto b_grid_desc_bk0_n_bk1 =
-            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto ds_grid_desc_m_n = generate_tuple(
-            [&](auto i) {
-                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths);
-            },
-            Number<NumDTensor>{});
-
-        const auto e_grid_desc_m_n =
-            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths);
+    static auto
+    GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1();
+        const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1();
+        const auto ds_grid_desc_m_n =
+            generate_tuple([&](auto) { return conv_to_gemm_transform.MakeCDescriptor_M_N(); },
+                           Number<NumDTensor>{});
+        const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N();

         return make_tuple(
             a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
     }

     // desc
-    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
+    constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform;
+    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform));

     using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;

@@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
         const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
         const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
         const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
-            ds_g_n_c_wis_lengths,
+            /*ds_g_n_c_wis_lengths*/,
         const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
             ds_g_n_c_wis_strides,
         const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths,

@@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
           b_element_op_{b_element_op},
           cde_element_op_{cde_element_op},
           a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
-          a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
           b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-          b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
-          ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
-          ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
-          e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
-          e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
           conv_filter_strides_{conv_filter_strides},
-          conv_filter_dilations_{conv_filter_dilations},
           input_left_pads_{input_left_pads},
           input_right_pads_{input_right_pads}
     {

@@ -382,68 +321,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
             tildes = {i_ztilde, i_ytilde, i_xtilde};
         }

+        ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
+                                                           a_g_n_k_wos_strides,
+                                                           b_g_k_c_xs_lengths,
+                                                           b_g_k_c_xs_strides,
+                                                           e_g_n_c_wis_lengths,
+                                                           e_g_n_c_wis_strides,
+                                                           conv_filter_strides,
+                                                           conv_filter_dilations,
+                                                           input_left_pads,
+                                                           input_right_pads,
+                                                           tildes};
+
         const auto a_grid_desc_ak0_m_ak1 =
-            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                a_g_n_k_wos_lengths,
-                a_g_n_k_wos_strides,
-                b_g_k_c_xs_lengths,
-                b_g_k_c_xs_strides,
-                e_g_n_c_wis_lengths,
-                e_g_n_c_wis_strides,
-                conv_filter_strides,
-                conv_filter_dilations,
-                input_left_pads,
-                input_right_pads,
-                tildes);
+            conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1();

         const auto b_grid_desc_bk0_n_bk1 =
-            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                a_g_n_k_wos_lengths,
-                a_g_n_k_wos_strides,
-                b_g_k_c_xs_lengths,
-                b_g_k_c_xs_strides,
-                e_g_n_c_wis_lengths,
-                e_g_n_c_wis_strides,
-                conv_filter_strides,
-                conv_filter_dilations,
-                input_left_pads,
-                input_right_pads,
-                tildes);
+            conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1();

         DsGridDesc_M_N ds_grid_desc_m_n;

         // populate Ds desc
         static_for<0, NumDTensor, 1>{}([&](auto i) {
             using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+            static_assert(is_same_v<DLayout, ELayout>);

-            ds_grid_desc_m_n(i) =
-                transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                    a_g_n_k_wos_lengths,
-                    a_g_n_k_wos_strides,
-                    b_g_k_c_xs_lengths,
-                    b_g_k_c_xs_strides,
-                    ds_g_n_c_wis_lengths[i],
-                    ds_g_n_c_wis_strides[i],
-                    conv_filter_strides,
-                    conv_filter_dilations,
-                    input_left_pads,
-                    input_right_pads,
-                    tildes);
-        });
-
-        const auto e_grid_desc_m_n =
-            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
-                a_g_n_k_wos_lengths,
-                a_g_n_k_wos_strides,
-                b_g_k_c_xs_lengths,
-                b_g_k_c_xs_strides,
-                e_g_n_c_wis_lengths,
-                e_g_n_c_wis_strides,
-                conv_filter_strides,
-                conv_filter_dilations,
-                input_left_pads,
-                input_right_pads,
-                tildes);
+            ConvToGemmBwdDataTransform conv_to_gemm_transform_d{
+                a_g_n_k_wos_lengths,
+                a_g_n_k_wos_strides,
+                b_g_k_c_xs_lengths,
+                b_g_k_c_xs_strides,
+                e_g_n_c_wis_lengths,
+                ds_g_n_c_wis_strides[i],
+                conv_filter_strides,
+                conv_filter_dilations,
+                input_left_pads,
+                input_right_pads,
+                tildes};
+            ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N();
+        });
+
+        const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N();

         // for check validity
         ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);

@@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
         BElementwiseOp b_element_op_;
         CDEElementwiseOp cde_element_op_;

-        // for checking IsSupportedArgument()
         std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
-        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
         std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
-        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
         std::array<index_t, NDimSpatial> conv_filter_strides_;
-        std::array<index_t, NDimSpatial> conv_filter_dilations_;
         std::array<index_t, NDimSpatial> input_left_pads_;
         std::array<index_t, NDimSpatial> input_right_pads_;
     };
...
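The net effect of this refactor: TransformConvBwdDataToGemm_v1 is no longer instantiated as a stateless static object whose descriptor factories receive the whole problem description on every call. Instead the layouts move into the template parameter list, the problem description is captured once when a ConvToGemmBwdDataTransform is constructed, and the Make*Descriptor methods become argument-free. A minimal sketch of the pattern with illustrative names (not CK's actual types):

#include <array>

// Before: stateless helper; every call site repeats the argument list.
struct StatelessTransformSketch
{
    int MakeDescriptor(const std::array<int, 4>& lengths,
                       const std::array<int, 4>& strides) const
    {
        return lengths[0] * strides[0]; // stand-in for real descriptor math
    }
};

// After: the problem description is bound at construction, so call sites
// no longer thread a dozen arrays through each factory call.
struct StatefulTransformSketch
{
    std::array<int, 4> lengths_{};
    std::array<int, 4> strides_{};

    // A constexpr default constructor also permits a dummy instance whose
    // only job is to pin down descriptor types at compile time.
    constexpr StatefulTransformSketch() = default;

    StatefulTransformSketch(const std::array<int, 4>& lengths,
                            const std::array<int, 4>& strides)
        : lengths_{lengths}, strides_{strides}
    {
    }

    // No per-call arguments: the captured state is reused.
    int MakeDescriptor() const { return lengths_[0] * strides_[0]; }
};

This mirrors how the Argument constructor above now builds conv_to_gemm_transform_ once and has every descriptor call reuse it, while dummy_conv_to_gemm_transform exists only to derive ABDsEGridDesc.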
@@ -998,14 +998,14 @@ struct FmhaFwdKernel
             return pad_tensor_view(
                 q_dram_naive,
                 make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
-                sequence<false, kPadHeadDimQ>{});
+                sequence<kPadSeqLenQ, kPadHeadDimQ>{});
         }
         else
         {
             return pad_tensor_view(
                 q_dram_naive,
                 make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<false, kPadHeadDimQ>{});
+                sequence<kPadSeqLenQ, kPadHeadDimQ>{});
         }
     }();
     const auto k_dram = [&]() {

@@ -1019,7 +1019,7 @@ struct FmhaFwdKernel
         return pad_tensor_view(
             k_dram_naive,
             make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-            sequence<false, kPadHeadDimQ>{});
+            sequence<kPadSeqLenK, kPadHeadDimQ>{});
     }();
     const auto v_dram = [&]() {
         if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)

@@ -1041,7 +1041,7 @@ struct FmhaFwdKernel
             return pad_tensor_view(
                 v_dram_transposed,
                 make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                sequence<kPadHeadDimV, false>{});
+                sequence<kPadHeadDimV, kPadSeqLenK>{});
         }
         else
         {

@@ -1055,7 +1055,7 @@ struct FmhaFwdKernel
             return pad_tensor_view(
                 v_dram_naive,
                 make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                sequence<false, kPadSeqLenK>{});
+                sequence<kPadHeadDimV, kPadSeqLenK>{});
         }
     }();

@@ -1097,8 +1097,9 @@ struct FmhaFwdKernel
                 number<FmhaPipeline::kAlignmentBias>{},
                 number<1>{});

-            return pad_tensor_view(
-                bias_dram_naive, bias_dram_window_lengths, sequence<false, kPadSeqLenK>{});
+            return pad_tensor_view(bias_dram_naive,
+                                   bias_dram_window_lengths,
+                                   sequence<kPadSeqLenQ, kPadSeqLenK>{});
         }();

         return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});
...
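These fixes replace hard-coded false entries with the kernel's compile-time kPadSeqLenQ/kPadSeqLenK/kPadHeadDimV traits, so every tensor view is padded along exactly the dimensions that may be ragged at runtime. A minimal sketch of the assumed per-dimension padding semantics (illustrative, not ck_tile's actual pad_tensor_view):

#include <cassert>
#include <cstddef>

// Round len up to a multiple of tile only when the pad flag for that
// dimension is set; unpadded dimensions must already be tile-aligned.
constexpr std::size_t pad_dim(std::size_t len, std::size_t tile, bool do_pad)
{
    return do_pad ? ((len + tile - 1) / tile) * tile : len;
}

int main()
{
    constexpr bool kPadSeqLenQ  = true;  // illustrative trait values
    constexpr bool kPadHeadDimQ = false;

    // seqlen_q = 100 with tile kM0 = 64 is padded up to 128;
    // hdim_q = 128 with tile kK0 = 32 is left alone (already aligned).
    assert(pad_dim(100, 64, kPadSeqLenQ) == 128);
    assert(pad_dim(128, 32, kPadHeadDimQ) == 128);
}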
@@ -6,7 +6,7 @@ set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/)
 # CK Codegen requires dataclass which is added in Python 3.7
 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
 if(NOT CK_USE_ALTERNATIVE_PYTHON)
-    find_package(PythonInterp 3 REQUIRED)
+    find_package(Python3 COMPONENTS Interpreter Development)
 else()
     message("Using alternative python version")
     set(EXTRA_PYTHON_PATH)

@@ -33,7 +33,7 @@ set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
 # Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
 # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 execute_process(
-    COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
+    COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
     --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
     --api ${FMHA_KNOWN_APIS}
     --receipt 3

@@ -50,7 +50,7 @@ endif()
 # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 add_custom_command(
     OUTPUT ${FMHA_GEN_BLOBS}
-    COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
+    COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
     --output_dir ${FMHA_CPP_FOLDER}
     --api ${FMHA_KNOWN_APIS}
     --receipt 3
...
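Background for the build change: FindPythonInterp is deprecated (since CMake 3.12) and defines PYTHON_EXECUTABLE, while the FindPython3 module defines Python3_EXECUTABLE, which is why the execute_process and add_custom_command invocations switch variables in step. Note the new call also drops REQUIRED, so configuration no longer hard-fails here when no interpreter is found. A minimal usage sketch:

# Modern module: defines Python3_EXECUTABLE, Python3_FOUND, etc.
find_package(Python3 COMPONENTS Interpreter)
if(Python3_FOUND)
    message(STATUS "Python: ${Python3_EXECUTABLE}")
    execute_process(COMMAND ${Python3_EXECUTABLE} --version)
endif()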
@@ -82,7 +82,7 @@ def parse_logfile(logfile):
     StrideA=[]
     StrideB=[]
     StrideC=[]
-    if 'perf_gemm.log' in logfile:
+    if 'perf_gemm' in logfile and 'gemm_bilinear' not in logfile:
         for line in open(logfile):
             if 'Best Perf' in line:
                 lst=line.split()

@@ -260,7 +260,7 @@ def main():
     conn = sqlEngine.connect()
     #save gemm performance tests:
-    if 'perf_gemm.log' in filename:
+    if 'perf_gemm' in filename and 'gemm_bilinear' not in filename:
         #write the ck_gemm_test_params table only needed once the test set changes
         #post_test_params(test_list,conn)
         for i in range(1,len(results)+1):
...
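The relaxed condition matches any log whose name contains perf_gemm while keeping bilinear-gemm logs out of this parsing branch. A small sketch of the predicate (the non-matching file names below are illustrative):

def is_plain_gemm_log(name: str) -> bool:
    # same condition as the diff, factored out for clarity
    return 'perf_gemm' in name and 'gemm_bilinear' not in name

assert is_plain_gemm_log('perf_gemm.log')
assert not is_plain_gemm_log('perf_gemm_bilinear.log')  # hypothetical name
assert not is_plain_gemm_log('perf_onnx_gemm.log')      # handled elsewhere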
@@ -11,9 +11,22 @@
 #process results
 python3 process_perf_data.py perf_gemm.log
-python3 process_perf_data.py perf_onnx_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
+file=./perf_onnx_gemm_gfx10.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx10.log
+fi
+file=./perf_onnx_gemm_gfx11.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx11.log
+fi
+file=./perf_onnx_gemm_gfx12.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx12.log
+fi
 file=./perf_fmha_fwd_gfx942.log
 if [ -e "$file" ]; then
     python3 process_perf_data.py perf_fmha_fwd_gfx942.log
...
@@ -24,6 +24,18 @@ python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
 python3 process_perf_data.py perf_mixed_gemm.log
+file=./perf_onnx_gemm_gfx10.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx10.log
+fi
+file=./perf_onnx_gemm_gfx11.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx11.log
+fi
+file=./perf_onnx_gemm_gfx12.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_onnx_gemm_gfx12.log
+fi
 file=./perf_fmha_fwd_gfx942.log
 if [ -e "$file" ]; then
     python3 process_perf_data.py perf_fmha_fwd_gfx942.log
...
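Both scripts above repeat the same existence check once per architecture; an equivalent loop form (a sketch only, not what the scripts currently do) would be:

for f in perf_onnx_gemm_gfx10.log perf_onnx_gemm_gfx11.log perf_onnx_gemm_gfx12.log; do
    if [ -e "./$f" ]; then
        python3 process_perf_data.py "$f"
    fi
done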
@@ -5,7 +5,7 @@
 # post your new test results to the database and compare them to the baseline
 # please contact Illia.Silin@amd.com for more details
 #
-# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> < node name>
+# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>
 # input arguments:
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verify correctness on CPU (may take a long time)
...
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# run the script as "./run_gemm_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name> <arch>
# input arguments:
# verification = 0 : do not verify result correctness on CPU
# = 1 : verify correctness on CPU (may take a long time)
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# node name : $hostname
# arch : GPU architecture, e.g. "gfx9" or "gfx1100"
#get the command line arguments:
export verify=$1
echo 'Verification: ' $verify
export env_type=$2
echo 'Environment type: ' $env_type
export branch=$3
echo 'Branch name: ' $branch
export host_name=$4
echo 'Host name: ' $host_name
export arch=$5
echo 'GPU architecture: ' $arch
function print_log_header(){
rm -f $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run ONNX gemm tests
export onnx_log="perf_onnx_gemm_$arch.log"
print_log_header $onnx_log $env_type $branch $host_name
./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
 #!/bin/bash
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
-# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> < node name>
+# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>
 # input arguments:
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verify correctness on CPU (may take a long time)

@@ -51,20 +51,11 @@ print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log
 ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log

-#run grouped_fwd fp16 tests
-export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log"
-print_log_header $conv_fwd_log $env_type $branch $host_name
-./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
-
-#run grouped_bwd_data fp16 tests
-export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log"
-print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
-./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
-
-#run grouped_bwd_weight fp16 tests
-export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log"
-print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
-./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
+#run ONNX gemm tests
+export onnx_log="perf_onnx_gemm.log"
+print_log_header $onnx_log $env_type $branch $host_name
+./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
+./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log

 #run resnet50 tests
 export resnet256_log="perf_resnet50_N256.log"
...
@@ -51,8 +51,11 @@ TEST(Custom_bool, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_bool_t>()(Number<i>{}) = custom_bool_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_bool_t, size> left_vec{right_vec};
+    vector_type<custom_bool_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_bool_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_bool_t>()(Number<i>{}).data, test_vec.at(i));

@@ -129,8 +132,11 @@ TEST(Custom_int8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_int8_t>()(Number<i>{}) = custom_int8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_int8_t, size> left_vec{right_vec};
+    vector_type<custom_int8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_int8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_int8_t>()(Number<i>{}).data, test_vec.at(i));

@@ -207,8 +213,11 @@ TEST(Custom_uint8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_uint8_t>()(Number<i>{}) = custom_uint8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_uint8_t, size> left_vec{right_vec};
+    vector_type<custom_uint8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_uint8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, test_vec.at(i));

@@ -287,8 +296,11 @@ TEST(Custom_f8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_f8_t>()(Number<i>{}) = custom_f8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_f8_t, size> left_vec{right_vec};
+    vector_type<custom_f8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_f8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_f8_t>()(Number<i>{}).data, test_vec.at(i));

@@ -369,8 +381,11 @@ TEST(Custom_bf8, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_bf8_t>()(Number<i>{}) = custom_bf8_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_bf8_t, size> left_vec{right_vec};
+    vector_type<custom_bf8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_bf8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, test_vec.at(i));

@@ -450,8 +465,11 @@ TEST(Custom_half, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_half_t>()(Number<i>{}) = custom_half_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_half_t, size> left_vec{right_vec};
+    vector_type<custom_half_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_half_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_half_t>()(Number<i>{}).data, test_vec.at(i));

@@ -533,8 +551,11 @@ TEST(Custom_bhalf, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_bhalf_t>()(Number<i>{}) = custom_bhalf_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_bhalf_t, size> left_vec{right_vec};
+    vector_type<custom_bhalf_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_bhalf_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data, test_vec.at(i));

@@ -615,8 +636,11 @@ TEST(Custom_float, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_float_t>()(Number<i>{}) = custom_float_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_float_t, size> left_vec{right_vec};
+    vector_type<custom_float_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_float_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_float_t>()(Number<i>{}).data, test_vec.at(i));

@@ -693,8 +717,11 @@ TEST(Custom_double, TestAsType)
     ck::static_for<0, size, 1>{}([&](auto i) {
         right_vec.template AsType<custom_double_t>()(Number<i>{}) = custom_double_t{test_vec.at(i)};
     });
-    // copy the vector
-    vector_type<custom_double_t, size> left_vec{right_vec};
+    vector_type<custom_double_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<custom_double_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<custom_double_t>()(Number<i>{}).data, test_vec.at(i));

@@ -813,8 +840,11 @@ TEST(Complex_half, TestAsType)
         right_vec.template AsType<complex_half_t>()(Number<i>{}) =
             complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)};
     });
-    // copy the vector
-    vector_type<complex_half_t, size> left_vec{right_vec};
+    vector_type<complex_half_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<complex_half_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
         ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).real,

@@ -907,8 +937,11 @@ TEST(FP8OCP, TestAsType)
         right_vec.template AsType<f8_t>()(Number<i>{}) = ck::type_convert<f8_t>(test_vec.at(i));
     });
-    // copy the vector
-    vector_type<f8_t, size> left_vec{right_vec};
+    vector_type<f8_t, size> left_vec;
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<f8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {

@@ -984,8 +1017,11 @@ TEST(BF8OCP, TestAsType)
         right_vec.template AsType<bf8_t>()(Number<i>{}) = ck::type_convert<bf8_t>(test_vec.at(i));
     });
-    // copy the vector
     vector_type<bf8_t, size> left_vec{right_vec};
+    // check copy assignment op
+    left_vec = right_vec;
+    // overwrite right_vec with 0s
+    right_vec = vector_type<bf8_t, size>{};
     // check if values were copied correctly
     ck::static_for<0, size, 1>{}([&](auto i) {
...
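The pattern repeated across these tests: default-construct the destination, copy via the assignment operator rather than the copy constructor, then zero the source so that a shallow or aliasing copy would make the subsequent comparisons fail (the BF8OCP variant keeps the copy-constructor check as well). A condensed sketch of the idea using a plain struct in place of vector_type:

#include <cassert>

struct vec4_sketch // stand-in for vector_type<T, size>
{
    int data[4] = {0, 0, 0, 0};
};

int main()
{
    vec4_sketch right_vec;
    for(int i = 0; i < 4; ++i)
        right_vec.data[i] = i + 1;

    vec4_sketch left_vec;       // default-construct, then...
    left_vec = right_vec;       // ...exercise copy assignment
    right_vec = vec4_sketch{};  // clobber the source with zeros

    for(int i = 0; i < 4; ++i)
        assert(left_vec.data[i] == i + 1); // deep copy survived
}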
-add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_xdl_wmma.cpp)
+add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_data_xdl.cpp)
 if(result EQUAL 0)
-    target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
+    target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
+endif()
+add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_grouped_convnd_bwd_data_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
 endif()
 add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp)
 if(result EQUAL 0)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
template <typename Tuple>
class TestGroupedConvndBwdDataWmma : public ::testing::Test
{
protected:
using DataType = std::tuple_element_t<0, Tuple>;
using OutLayout = std::tuple_element_t<1, Tuple>;
using WeiLayout = std::tuple_element_t<2, Tuple>;
using InLayout = std::tuple_element_t<3, Tuple>;
std::vector<ck::utils::conv::ConvParam> conv_params;
template <ck::index_t NDimSpatial>
void Run()
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
{
pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
OutLayout,
WeiLayout,
InLayout,
DataType,
DataType,
DataType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
}
EXPECT_TRUE(pass);
}
};
using namespace ck::tensor_layout::convolution;
using KernelTypes2d = ::testing::Types<std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
std::tuple<int8_t, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
std::tuple<int8_t, NHWGK, GKYXC, NHWGC>>;
using KernelTypes3d = ::testing::Types<std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<int8_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<int8_t, NDHWGK, GKZYXC, NDHWGC>>;
template <typename Tuple>
class TestGroupedConvndBwdDataWmma2d : public TestGroupedConvndBwdDataWmma<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndBwdDataWmma3d : public TestGroupedConvndBwdDataWmma<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndBwdDataWmma2d, Test2D)
{
this->conv_params.clear();
this->conv_params.push_back(
{2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back(
{2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->template Run<2>();
}
TYPED_TEST(TestGroupedConvndBwdDataWmma3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
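Each initializer list passed to conv_params.push_back appears to follow ConvParam's constructor order: spatial-dimension count, group count, batch N, output channels K, input channels C, then filter lengths, input spatial lengths, strides, dilations, and left/right pads. A hypothetical mirror of that layout, with the second 2D case spelled out (field names assumed, not CK's actual definition):

#include <array>

// Hypothetical mirror of ck::utils::conv::ConvParam's field order; used
// only to label the brace-initializer lists in the tests above.
struct ConvParamSketch
{
    int ndim;         // number of spatial dimensions
    int groups;       // G
    int batch;        // N
    int out_channels; // K
    int in_channels;  // C
    std::array<int, 2> filter_lengths; // {Y, X}
    std::array<int, 2> input_lengths;  // {Hi, Wi}
    std::array<int, 2> strides;
    std::array<int, 2> dilations;
    std::array<int, 2> left_pads;
    std::array<int, 2> right_pads;
};

// 2 groups, batch 128, 128 output and 256 input channels, 3x3 filter over
// a 14x14 input, stride and dilation 1, symmetric padding of 1.
constexpr ConvParamSketch example{
    2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};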
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iostream>

@@ -12,7 +12,7 @@
 #include "profiler/profile_grouped_conv_bwd_data_impl.hpp"

 template <typename Tuple>
-class TestGroupedConvndBwdData : public ::testing::Test
+class TestGroupedConvndBwdDataXdl : public ::testing::Test
 {
     protected:
     using DataType = std::tuple_element_t<0, Tuple>;

@@ -51,35 +51,31 @@ using namespace ck::tensor_layout::convolution;
 using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
-                                       std::tuple<int8_t, GNHWK, GKYXC, GNHWC>,
                                        std::tuple<float, NHWGK, GKYXC, NHWGC>,
                                        std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
-                                       std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>,
-                                       std::tuple<int8_t, NHWGK, GKYXC, NHWGC>>;
+                                       std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>;

 using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>,
-                                       std::tuple<int8_t, GNDHWK, GKZYXC, GNDHWC>,
                                        std::tuple<float, NDHWGK, GKZYXC, NDHWGC>,
                                        std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
-                                       std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>,
-                                       std::tuple<int8_t, NDHWGK, GKZYXC, NDHWGC>>;
+                                       std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>>;

 template <typename Tuple>
-class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData<Tuple>
+class TestGroupedConvndBwdDataXdl2d : public TestGroupedConvndBwdDataXdl<Tuple>
 {
 };

 template <typename Tuple>
-class TestGroupedConvndBwdData3d : public TestGroupedConvndBwdData<Tuple>
+class TestGroupedConvndBwdDataXdl3d : public TestGroupedConvndBwdDataXdl<Tuple>
 {
 };

-TYPED_TEST_SUITE(TestGroupedConvndBwdData2d, KernelTypes2d);
-TYPED_TEST_SUITE(TestGroupedConvndBwdData3d, KernelTypes3d);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl3d, KernelTypes3d);

-TYPED_TEST(TestGroupedConvndBwdData2d, Test2D)
+TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D)
 {
     this->conv_params.clear();

@@ -94,10 +90,13 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D)
     this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    // SplitN case
+    this->conv_params.push_back(
+        {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}});
     this->template Run<2>();
 }

-TYPED_TEST(TestGroupedConvndBwdData3d, Test3D)
+TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D)
 {
     this->conv_params.clear();
     this->conv_params.push_back(

@@ -112,5 +111,17 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D)
         {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->conv_params.push_back(
         {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    // SplitN case
+    this->conv_params.push_back({3,
+                                 1,
+                                 128,
+                                 4,
+                                 192,
+                                 {2, 2, 2},
+                                 {2, 224, 224},
+                                 {1, 224, 224},
+                                 {1, 1, 1},
+                                 {0, 0, 0},
+                                 {0, 0, 0}});
     this->template Run<3>();
 }