Commit d20c20a6 authored by Mirza Halilcevic

Merge remote-tracking branch 'upstream/develop' into gemm_elementwise_gemm

parents 250a89f3 10158b0f
......@@ -4,15 +4,15 @@
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
namespace ck_tile {
// A Tile Window: global memory
// B Tile Window: global memory
// C Distributed tensor: register
template <typename Problem, typename Policy = BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy>
struct BlockGemmPipelineAGmemBGmemCRegV2
template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV2DefaultPolicy>
struct GemmPipelineAGmemBGmemCRegV2
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using BDataType = remove_cvref_t<typename Problem::BDataType>;
......
......@@ -7,12 +7,11 @@
namespace ck_tile {
// Default policy for BlockGemmPipelineAGmemBGmemCRegV2
// Default policy for GemmPipelineAGmemBGmemCRegV2
// A default policy class should not be templated; put templates on member functions instead.
// NOTE: a policy should be bound to its corresponding operation. It's just a coincidence that
// BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy is the same as
// BlockGemmPipelineAGmemBGmemCRegV1DefaultPolicy
using BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy =
BlockGemmPipelineAGmemBGmemCRegV1DefaultPolicy;
// GemmPipelineAGmemBGmemCRegV2DefaultPolicy is the same as
// GemmPipelineAGmemBGmemCRegV1DefaultPolicy
using GemmPipelineAGmemBGmemCRegV2DefaultPolicy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy;
} // namespace ck_tile
......@@ -13,20 +13,23 @@ template <typename ADataType_,
typename BDataType_,
typename CDataType_,
typename BlockGemmShape_,
bool kPadA_ = false,
bool kPadB_ = false,
bool kPadC_ = false>
struct BlockGemmPipelineProblem
typename TileGemmTraits_>
struct GemmPipelineProblem
{
using ADataType = remove_cvref_t<ADataType_>;
using BDataType = remove_cvref_t<BDataType_>;
using CDataType = remove_cvref_t<CDataType_>;
using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
using GemmTraits = remove_cvref_t<TileGemmTraits_>;
static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
static constexpr bool kPadA = kPadA_;
static constexpr bool kPadB = kPadB_;
static constexpr bool kPadC = kPadC_;
static constexpr bool kPadA = GemmTraits::kPadA;
static constexpr bool kPadB = GemmTraits::kPadB;
static constexpr bool kPadC = GemmTraits::kPadC;
using LayoutA = remove_cvref_t<typename GemmTraits::LayoutA>;
using LayoutB = remove_cvref_t<typename GemmTraits::LayoutB>;
using LayoutC = remove_cvref_t<typename GemmTraits::LayoutC>;
static constexpr index_t AlignmentA = kPadA ? 1 : VectorLoadSize / sizeof(ADataType);
static constexpr index_t AlignmentB = kPadB ? 1 : VectorLoadSize / sizeof(BDataType);
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
namespace ck_tile {
// UniversalGemm Policy
template <typename LayoutA_, typename LayoutB_, typename LayoutC_>
struct UniversalGemmPipelineAgBgCrPolicy
{
using LayoutA = remove_cvref_t<LayoutA_>;
using LayoutB = remove_cvref_t<LayoutB_>;
using LayoutC = remove_cvref_t<LayoutC_>;
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
static constexpr auto I2 = number<2>{};
static constexpr bool TransposeC = true;
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
using ADataType = remove_cvref_t<typename Problem::ADataType>;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
if constexpr(std::is_same<tensor_layout::gemm::RowMajor, LayoutA>::value)
{
constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
? 1
: 32 * 4 / KPerBlock / sizeof(ADataType);
constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
make_tuple(K0 * number<MLdsLayer>{}, number<MPerBlock / MLdsLayer>{}, K1),
make_tuple(K1, number<KPerBlock * MLdsLayer>{}, I1));
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc,
make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
number<K0 * MLdsLayer>{})),
make_pass_through_transform(K1)),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto a_lds_block_desc_ak0_kMLdsLayer_m_ak1 = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(make_unmerge_transform(make_tuple(K0, number<MLdsLayer>{})),
make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
a_lds_block_desc_ak0_kMLdsLayer_m_ak1,
make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{}))),
make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return a_lds_block_desc_m_k;
}
else // ColumnMajor A
{
// The kfold and mpair dimensions are not always required.
// More dimensions in merge_transform increase the difficulty of generating the immarg
// (immediate-argument) offset for the compiler.
constexpr auto M0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I0);
constexpr auto M1 = MPerBlock / M0;
constexpr auto KThreadWrite = Problem::kBlockSize / M0;
constexpr auto K0PerThreadWrite = K0 / KThreadWrite;
constexpr auto KThreadRead = 64 / WarpGemm::kM;
constexpr auto K0PerThreadRead = K0 / KThreadRead;
constexpr auto kfold =
(K1 * M0 * sizeof(ADataType) > 128) ? 1 : 128 / (K1 * M0 * sizeof(ADataType));
constexpr auto KThreadReadPerm =
(kfold * K0PerThreadWrite / K0PerThreadRead) > 1
? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
: KThreadRead;
// 1 <= mpair <= M0
constexpr auto mpair = (K1 * WarpGemm::kM * sizeof(ADataType) > 128)
? 1
: ((128 / (K1 * WarpGemm::kM * sizeof(ADataType))) > M0
? M0
: 128 / (K1 * WarpGemm::kM * sizeof(ADataType)));
constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
number<K0PerThreadWrite>{},
number<KThreadReadPerm * M1>{},
number<kfold * M0 / mpair>{},
number<mpair>{},
K1));
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_xor_transform(
make_tuple(number<KThreadReadPerm * M1>{}, number<kfold * M0 / mpair>{})),
make_pass_through_transform(number<mpair>{}),
make_pass_through_transform(K1)),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<M1>{})),
make_unmerge_transform(make_tuple(number<kfold>{}, number<M0 / mpair>{})),
make_pass_through_transform(number<mpair>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{},
sequence<1>{},
sequence<2>{},
sequence<3>{},
sequence<4>{},
sequence<5>{}),
make_tuple(sequence<1>{},
sequence<2>{},
sequence<0, 3>{},
sequence<4, 5>{},
sequence<6>{},
sequence<7>{}));
constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
a_lds_block_desc_unmerged,
make_tuple(make_merge_transform_v3_division_mod(
make_tuple(number<KThreadReadPerm>{},
number<KThreadWrite / kfold / KThreadReadPerm>{},
number<kfold>{},
number<K0PerThreadWrite>{},
K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<M0 / mpair>{}, number<mpair>{}, number<M1>{}))),
make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return a_lds_block_desc_m_k;
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
using BDataType = remove_cvref_t<typename Problem::BDataType>;
constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
if constexpr(std::is_same<tensor_layout::gemm::ColumnMajor, LayoutB>::value)
{
// NLdsLayer * K0 as logical Bank
constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
? 1
: 32 * 4 / KPerBlock / sizeof(BDataType);
constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
make_tuple(K0 * number<NLdsLayer>{}, number<NPerBlock / NLdsLayer>{}, K1),
make_tuple(K1, number<KPerBlock * NLdsLayer>{}, I1));
constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
b_lds_block_desc,
make_tuple(make_xor_transform(make_tuple(number<NPerBlock / NLdsLayer>{},
number<K0 * NLdsLayer>{})),
make_pass_through_transform(K1)),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto b_lds_block_desc_bk0_kNLdsLayer_n_bk1 = transform_tensor_descriptor(
b_lds_block_desc_permuted,
make_tuple(make_unmerge_transform(make_tuple(K0, number<NLdsLayer>{})),
make_pass_through_transform(number<NPerBlock / NLdsLayer>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
b_lds_block_desc_bk0_kNLdsLayer_n_bk1,
make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<NPerBlock / NLdsLayer>{}, number<NLdsLayer>{}))),
make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return b_lds_block_desc_n_k;
}
else // RowMajor B
{
constexpr auto N0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I1);
constexpr auto N1 = NPerBlock / N0;
constexpr auto KThreadWrite = Problem::kBlockSize / N0;
constexpr auto K0PerThreadWrite = K0 / KThreadWrite;
constexpr auto KThreadRead = 64 / WarpGemm::kN;
constexpr auto K0PerThreadRead = K0 / KThreadRead;
constexpr auto kfold =
(K1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (K1 * N0 * sizeof(BDataType));
constexpr auto KThreadReadPerm =
(kfold * K0PerThreadWrite / K0PerThreadRead) > 1
? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
: KThreadRead;
// 1 <= npair <= N0
constexpr auto npair = (K1 * WarpGemm::kN * sizeof(BDataType) > 128)
? 1
: ((128 / (K1 * WarpGemm::kN * sizeof(BDataType))) > N0
? N0
: 128 / (K1 * WarpGemm::kN * sizeof(BDataType)));
constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
number<K0PerThreadWrite>{},
number<KThreadReadPerm * N1>{},
number<kfold * N0 / npair>{},
number<npair>{},
K1));
constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
b_lds_block_desc,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_xor_transform(
make_tuple(number<KThreadReadPerm * N1>{}, number<kfold * N0 / npair>{})),
make_pass_through_transform(number<npair>{}),
make_pass_through_transform(K1)),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
b_lds_block_desc_permuted,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<N1>{})),
make_unmerge_transform(make_tuple(number<kfold>{}, number<N0 / npair>{})),
make_pass_through_transform(number<npair>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{},
sequence<1>{},
sequence<2>{},
sequence<3>{},
sequence<4>{},
sequence<5>{}),
make_tuple(sequence<1>{},
sequence<2>{},
sequence<0, 3>{},
sequence<4, 5>{},
sequence<6>{},
sequence<7>{}));
constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
b_lds_block_desc_unmerged,
make_tuple(make_merge_transform_v3_division_mod(
make_tuple(number<KThreadReadPerm>{},
number<KThreadWrite / kfold / KThreadReadPerm>{},
number<kfold>{},
number<K0PerThreadWrite>{},
K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<N0 / npair>{}, number<npair>{}, number<N1>{}))),
make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return b_lds_block_desc_n_k;
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
{
constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
MakeALdsBlockDescriptor<Problem>().get_element_space_size();
return smem_size_a;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
{
constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
return smem_size_b;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
index_t smem_size = 0;
smem_size += smem_size_a + smem_size_b;
return smem_size;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
constexpr index_t M2 = get_warp_size() / K0;
constexpr index_t M1 = BlockSize / get_warp_size();
static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
constexpr index_t M0 = MPerBlock / (M2 * M1);
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
constexpr index_t N2 = get_warp_size() / K0;
constexpr index_t N1 = BlockSize / get_warp_size();
static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
constexpr index_t N0 = NPerBlock / (N2 * N1);
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
{
using AccDataType = float;
using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
using WarpTile = typename Problem::BlockGemmShape::WarpTile;
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
AccDataType,
WarpTile::at(I0),
WarpTile::at(I1),
WarpTile::at(I2),
TransposeC>;
using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
BlockWarps,
WarpGemm>;
return BlockGemmASmemBSmemCRegV1<Problem, BlockGemmPolicy>{};
}
};
} // namespace ck_tile
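Illustrative sketch (not part of this diff): given a pipeline Problem type (assumed here), the new policy is parameterized on the three layouts, and its LDS footprint can be queried at compile time:
using Policy = ck_tile::UniversalGemmPipelineAgBgCrPolicy<ck_tile::tensor_layout::gemm::RowMajor, ck_tile::tensor_layout::gemm::ColumnMajor, ck_tile::tensor_layout::gemm::RowMajor>;
constexpr ck_tile::index_t lds_bytes = Policy::GetSmemSize<Problem>(); // bytes for the A tile plus the B tile in LDS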
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <bool kPadA_,
bool kPadB_,
bool kPadC_,
typename LayoutA_,
typename LayoutB_,
typename LayoutC_>
struct TileGemmTraits
{
static constexpr bool kPadA = kPadA_;
static constexpr bool kPadB = kPadB_;
static constexpr bool kPadC = kPadC_;
using LayoutA = LayoutA_;
using LayoutB = LayoutB_;
using LayoutC = LayoutC_;
};
} // namespace ck_tile
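Illustrative sketch (assumed tile shape and data types, not taken from this diff): the traits bundle the padding flags and layouts, and feed the reworked GemmPipelineProblem:
using Traits  = ck_tile::TileGemmTraits<false, false, false, ck_tile::tensor_layout::gemm::RowMajor, ck_tile::tensor_layout::gemm::ColumnMajor, ck_tile::tensor_layout::gemm::RowMajor>;
using Problem = ck_tile::GemmPipelineProblem<ck_tile::half_t, ck_tile::half_t, float, Shape, Traits>; // Shape: an assumed BlockGemmShape type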
......@@ -31,8 +31,14 @@ struct Layernorm2dFwd
static constexpr ck_tile::index_t kMPerBlock = Problem::BlockShape::kMPerBlock;
static constexpr ck_tile::index_t kNPerBlock = Problem::BlockShape::kNPerBlock;
static constexpr bool kPadM = Problem::kPadM;
static constexpr bool kPadN = Problem::kPadN;
static constexpr ck_tile::index_t kNThreadPerWarp = Problem::BlockShape::kNThreadPerWarp;
static constexpr ck_tile::index_t kNPerThread = Problem::BlockShape::kNPerThread;
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
struct Kargs
{
......@@ -96,19 +102,25 @@ struct Layernorm2dFwd
sequence<2>>{});
}
template <typename Dstr>
CK_TILE_DEVICE static constexpr auto GetNPerThread(Dstr)
CK_TILE_DEVICE static int GetWelfordMaxCount(int N)
{
constexpr auto nDstrSpan = Dstr::get_distributed_spans().template at<1>();
using Lengths = decltype(nDstrSpan.impl_);
constexpr ck_tile::index_t kNThreadPerBlock = kNPerBlock / kNPerThread;
ck_tile::index_t ret = 1;
int thread_id_n = get_thread_id() % kNThreadPerBlock;
int max_count =
__builtin_amdgcn_readfirstlane(N < kNPerBlock ? 0 : kNPerThread * (N / kNPerBlock));
int n_per_block_tail_loop =
__builtin_amdgcn_readfirstlane(N - max_count * kNThreadPerBlock);
ck_tile::static_for<0, Lengths::size(), 1>{}(
[&](auto idx) { ret *= Lengths::template at(idx); });
if(n_per_block_tail_loop > 0)
{
int thread_max_n = (thread_id_n + 1) * kNPerThread;
int delta = clamp(thread_max_n - n_per_block_tail_loop, 0, kNPerThread);
max_count += kNPerThread - delta;
}
return ret;
return max_count;
}
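// Worked example (illustrative, assumed tile sizes): with kNPerBlock = 128, kNPerThread = 8
// (so kNThreadPerBlock = 16) and N = 200:
//   max_count             = 8 * (200 / 128) = 8
//   n_per_block_tail_loop = 200 - 8 * 16    = 72
//   thread_id_n 0..8  -> thread_max_n <= 72, delta = 0, max_count = 16
//   thread_id_n 9..15 -> thread_max_n >  72, delta = 8, max_count stays 8
// Total elements counted per row: 9 * 16 + 7 * 8 = 200, which matches N.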
template <typename DistributedTensor>
......@@ -129,42 +141,29 @@ struct Layernorm2dFwd
return out_dstr_tensor;
}
template <bool Cond = (kHasGamma && kHasBeta)>
CK_TILE_DEVICE std::enable_if_t<Cond> TwoPassLayernorm2dFwd(const XDataType* p_x,
const GammaDataType* p_gamma,
const BetaDataType* p_beta,
YDataType* p_y,
MeanDataType* p_mean,
InvStdDataType* p_invStd,
const ComputeDataType epsilon,
ck_tile::index_t M,
ck_tile::index_t N) const
template <typename XBlockWindow,
typename GammaBlockWindow,
typename BetaBlockWindow,
typename YBlockWindow,
typename MeanBlockWindow,
typename InvStdBlockWindow,
bool Cond = (kHasGamma && kHasBeta)>
CK_TILE_DEVICE std::enable_if_t<Cond>
TwoPassLayernorm2dFwd(XBlockWindow& x_block_window,
GammaBlockWindow& gamma_block_window,
BetaBlockWindow& beta_block_window,
YBlockWindow& y_block_window,
MeanBlockWindow& mean_block_window,
InvStdBlockWindow& inv_std_block_window,
ComputeDataType epsilon,
ck_tile::index_t N) const
{
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
p_x, make_tuple(M, N), make_tuple(N, 1), number<32>{}, number<1>{});
const auto gamma_n = make_naive_tensor_view<address_space_enum::global>(
p_gamma, make_tuple(N), make_tuple(1), number<32>{}, number<1>{});
// TODO - Optimize tail loop to reduce move_tile_window()
index_t num_n_tile_iteration =
__builtin_amdgcn_readfirstlane(integer_divide_ceil(N, kNPerBlock));
const auto beta_n = make_naive_tensor_view<address_space_enum::global>(
p_beta, make_tuple(N), make_tuple(1), number<32>{}, number<1>{});
const auto iM = get_block_id() * kMPerBlock;
constexpr auto xDstr = MakeXBlockTileDistribution();
auto x_block_window = make_tile_window(
x_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0}, xDstr);
index_t num_n_tile_iteration = __builtin_amdgcn_readfirstlane(N / kNPerBlock);
// TODO: padding - handle max_count if N % kNPerBlock != 0
constexpr auto NPerThread = GetNPerThread(xDstr);
ThreadWelford<ComputeDataType, XDataType> thread_welford{
type_convert<int>(NPerThread * N / kNPerBlock)};
int welford_max_count = GetWelfordMaxCount(N);
ThreadWelford<ComputeDataType, XDataType> thread_welford{welford_max_count};
using XTensorType = decltype(load_tile(x_block_window));
auto mean_compute_block_tensor =
......@@ -190,44 +189,14 @@ struct Layernorm2dFwd
auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon);
if constexpr(kSaveMean)
{
const auto mean_m = make_naive_tensor_view_packed<address_space_enum::global>(
p_mean, make_tuple(M), number<32>{});
auto mean_block_window =
make_tile_window(mean_m, make_tuple(number<kMPerBlock>{}), {iM});
store_tile(mean_block_window, cast_tile<MeanDataType>(mean_compute_block_tensor));
}
if constexpr(kSaveInvStd)
{
const auto inv_std_m = make_naive_tensor_view_packed<address_space_enum::global>(
p_invStd, make_tuple(M), number<32>{});
auto inv_std_block_window =
make_tile_window(inv_std_m, make_tuple(number<kMPerBlock>{}), {iM});
store_tile(inv_std_block_window, cast_tile<MeanDataType>(inv_std_compute_block_tensor));
}
// TODO: Extract normalize pipeline
const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
p_y, make_tuple(M, N), make_tuple(N, 1), number<32>{}, number<1>{});
auto y_block_window = make_tile_window(
y_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0});
constexpr auto gammaDstr = MakeGammaBetaBlockTileDistribution();
constexpr auto betaDstr = gammaDstr;
auto gamma_block_window =
make_tile_window(gamma_n, make_tuple(number<kNPerBlock>{}), {0}, gammaDstr);
auto beta_block_window = make_tile_window(
beta_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {0}, betaDstr);
store_tile(inv_std_block_window,
cast_tile<InvStdDataType>(inv_std_compute_block_tensor));
// read x in reverse to reuse the cache
ck_tile::index_t stride_to_right_most_window = N - kNPerBlock;
ck_tile::index_t stride_to_right_most_window =
N % kNPerBlock == 0 ? N - kNPerBlock : N - N % kNPerBlock;
move_tile_window(x_block_window, {0, -kNPerBlock});
move_tile_window(gamma_block_window, {stride_to_right_most_window});
......@@ -274,17 +243,209 @@ struct Layernorm2dFwd
}
}
template <typename XBlockWindow,
typename GammaBlockWindow,
typename BetaBlockWindow,
typename YBlockWindow,
typename MeanBlockWindow,
typename InvStdBlockWindow,
bool Cond = (kHasGamma && kHasBeta)>
CK_TILE_DEVICE std::enable_if_t<Cond>
OnePassLayernorm2dFwd(XBlockWindow& x_block_window,
GammaBlockWindow& gamma_block_window,
BetaBlockWindow& beta_block_window,
YBlockWindow& y_block_window,
MeanBlockWindow& mean_block_window,
InvStdBlockWindow& inv_std_block_window,
ComputeDataType epsilon,
ck_tile::index_t N) const
{
int welford_max_count = GetWelfordMaxCount(N);
ThreadWelford<ComputeDataType, XDataType> thread_welford{welford_max_count};
using XTensorType = decltype(load_tile(x_block_window));
auto mean_compute_block_tensor =
thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
auto var_compute_block_tensor =
thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
clear_tile(mean_compute_block_tensor);
clear_tile(var_compute_block_tensor);
const auto x_block_tensor = load_tile(x_block_window);
thread_welford(x_block_tensor, mean_compute_block_tensor, var_compute_block_tensor);
// TODO: support cross warp Welford
WarpMergeWelford<ComputeDataType, true>{}(
mean_compute_block_tensor, var_compute_block_tensor, thread_welford.cur_count_);
auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon);
if constexpr(kSaveMean)
store_tile(mean_block_window, cast_tile<MeanDataType>(mean_compute_block_tensor));
if constexpr(kSaveInvStd)
store_tile(inv_std_block_window,
cast_tile<InvStdDataType>(inv_std_compute_block_tensor));
// normalize
const auto gamma_block_tensor = load_tile(gamma_block_window);
const auto beta_block_tensor = load_tile(beta_block_window);
constexpr auto x_spans = decltype(x_block_tensor)::get_distributed_spans();
auto y_block_tensor =
make_static_distributed_tensor<YDataType>(x_block_tensor.get_tile_distribution());
sweep_tile_span(x_spans[I1], [&](auto idx1) {
constexpr auto j_idx = make_tuple(idx1);
const auto gamma = type_convert<ComputeDataType>(gamma_block_tensor[j_idx]);
const auto beta = type_convert<ComputeDataType>(beta_block_tensor[j_idx]);
sweep_tile_span(x_spans[I0], [&](auto idx0) {
constexpr auto i_idx = make_tuple(idx0);
constexpr auto i_j_idx = make_tuple(idx0, idx1);
const auto mean = mean_compute_block_tensor[i_idx];
const auto inv_std = inv_std_compute_block_tensor[i_idx];
const auto x = type_convert<ComputeDataType>(x_block_tensor[i_j_idx]);
auto y = (x - mean) * inv_std * gamma + beta;
y_block_tensor(i_j_idx) = type_convert<YDataType>(y);
});
});
store_tile(y_block_window, y_block_tensor);
}
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
TwoPassLayernorm2dFwd(static_cast<const XDataType*>(kargs.p_x),
static_cast<const GammaDataType*>(kargs.p_gamma),
static_cast<const BetaDataType*>(kargs.p_beta),
static_cast<YDataType*>(kargs.p_y),
static_cast<MeanDataType*>(kargs.p_mean),
static_cast<InvStdDataType*>(kargs.p_invStd),
static_cast<const ComputeDataType>(kargs.epsilon),
kargs.M,
kargs.N);
const auto x_m_n = [&]() {
const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XDataType*>(kargs.p_x),
make_tuple(kargs.M, kargs.N),
make_tuple(kargs.N, 1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(x_dram_naive,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
sequence<kPadM, kPadN>{});
}();
const auto gamma_n = [&]() {
const auto gamma_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<const GammaDataType*>(kargs.p_gamma),
make_tuple(kargs.N),
make_tuple(1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(
gamma_dram_naive, make_tuple(number<kNPerBlock>{}), sequence<kPadN>{});
}();
const auto beta_n = [&]() {
const auto beta_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<const BetaDataType*>(kargs.p_beta),
make_tuple(kargs.N),
make_tuple(1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(
beta_dram_naive, make_tuple(number<kNPerBlock>{}), sequence<kPadN>{});
}();
const auto iM = get_block_id() * kMPerBlock;
constexpr auto xDstr = MakeXBlockTileDistribution();
auto x_block_window = make_tile_window(
x_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0}, xDstr);
const auto y_m_n = [&]() {
const auto y_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<YDataType*>(kargs.p_y),
make_tuple(kargs.M, kargs.N),
make_tuple(kargs.N, 1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(y_dram_naive,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
sequence<kPadM, kPadN>{});
}();
auto y_block_window = make_tile_window(
y_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0});
constexpr auto gammaDstr = MakeGammaBetaBlockTileDistribution();
constexpr auto betaDstr = gammaDstr;
auto gamma_block_window =
make_tile_window(gamma_n, make_tuple(number<kNPerBlock>{}), {0}, gammaDstr);
auto beta_block_window = make_tile_window(
beta_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {0}, betaDstr);
auto mean_block_window = [&]() {
if constexpr(kSaveMean)
{
const auto mean_m = [&]() {
const auto mean_dram_naive =
make_naive_tensor_view_packed<address_space_enum::global>(
static_cast<MeanDataType*>(kargs.p_mean),
make_tuple(kargs.M),
number<1>{});
return pad_tensor_view(
mean_dram_naive, make_tuple(number<kMPerBlock>{}), sequence<kPadM>{});
}();
return make_tile_window(mean_m, make_tuple(number<kMPerBlock>{}), {iM});
}
else
return make_null_tile_window(make_tuple(number<kMPerBlock>{}));
}();
auto inv_std_block_window = [&]() {
if constexpr(kSaveInvStd)
{
const auto inv_std_m = [&]() {
const auto inv_std_dram_naive =
make_naive_tensor_view_packed<address_space_enum::global>(
static_cast<InvStdDataType*>(kargs.p_invStd),
make_tuple(kargs.M),
number<1>{});
return pad_tensor_view(
inv_std_dram_naive, make_tuple(number<kMPerBlock>{}), sequence<kPadM>{});
}();
return make_tile_window(inv_std_m, make_tuple(number<kMPerBlock>{}), {iM});
}
else
return make_null_tile_window(make_tuple(number<kMPerBlock>{}));
}();
if(kargs.N <= kNPerBlock)
OnePassLayernorm2dFwd(x_block_window,
gamma_block_window,
beta_block_window,
y_block_window,
mean_block_window,
inv_std_block_window,
static_cast<const ComputeDataType>(kargs.epsilon),
kargs.N);
else
TwoPassLayernorm2dFwd(x_block_window,
gamma_block_window,
beta_block_window,
y_block_window,
mean_block_window,
inv_std_block_window,
static_cast<const ComputeDataType>(kargs.epsilon),
kargs.N);
}
};
......
......@@ -14,17 +14,21 @@ template <typename XDataType_,
typename YDataType_,
typename MeanDataType_,
typename InvStdDataType_,
typename BlockShape_>
typename BlockShape_,
bool kPadM_,
bool kPadN_>
struct BlockLayernorm2dFwdProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using GammaDataType = remove_cvref_t<GammaDataType_>;
using BetaDataType = remove_cvref_t<BetaDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YDataType = remove_cvref_t<YDataType_>;
using MeanDataType = remove_cvref_t<MeanDataType_>;
using InvStdDataType = remove_cvref_t<InvStdDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
using XDataType = remove_cvref_t<XDataType_>;
using GammaDataType = remove_cvref_t<GammaDataType_>;
using BetaDataType = remove_cvref_t<BetaDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YDataType = remove_cvref_t<YDataType_>;
using MeanDataType = remove_cvref_t<MeanDataType_>;
using InvStdDataType = remove_cvref_t<InvStdDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
static constexpr bool kPadM = kPadM_;
static constexpr bool kPadN = kPadN_;
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename ComputeTypeA,
typename ComputeTypeB>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
naive_gemm_kernel(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
CDataType* __restrict__ p_c_grid,
index_t m,
index_t n,
index_t k,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation c_element_op)
{
using RowMajor = ck::tensor_layout::gemm::RowMajor;
const int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int col_idx = blockIdx.y * blockDim.y + threadIdx.y;
if(row_idx < m && col_idx < n)
{
AccDataType v_acc = static_cast<AccDataType>(0.0);
ComputeTypeA v_a = static_cast<ComputeTypeA>(0.0);
ComputeTypeB v_b = static_cast<ComputeTypeB>(0.0);
CDataType v_c = static_cast<CDataType>(0.0);
for(int k_idx = 0; k_idx < k; ++k_idx)
{
// check the input matrix layouts
int element_idx_a = 0;
int element_idx_b = 0;
if constexpr(std::is_same_v<ALayout, RowMajor>)
{
element_idx_a = row_idx * k + k_idx;
}
else
{
element_idx_a = row_idx + m * k_idx;
}
if constexpr(std::is_same_v<BLayout, RowMajor>)
{
element_idx_b = k_idx * n + col_idx;
}
else
{
element_idx_b = k_idx + k * col_idx;
}
// apply a_element_op
a_element_op(v_a, p_a_grid[element_idx_a]);
// apply b_element_op
b_element_op(v_b, p_b_grid[element_idx_b]);
// multiply and accumulate
v_acc += static_cast<AccDataType>(v_a) * static_cast<AccDataType>(v_b);
}
// apply c_element_op
c_element_op(v_c, v_acc);
// check output matrix layout
int element_idx_c = 0;
if constexpr(std::is_same_v<CLayout, RowMajor>)
{
element_idx_c = row_idx * n + col_idx;
}
else
{
element_idx_c = row_idx + m * col_idx;
}
// prepare output
p_c_grid[element_idx_c] = v_c;
}
}
} // namespace ck
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename ComputeTypeA = CDataType,
typename ComputeTypeB = ComputeTypeA>
struct ReferenceGemm : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const void* p_a_grid,
const void* p_b_grid,
void* p_c_grid,
index_t m,
index_t n,
index_t k,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
p_c_grid_{static_cast<CDataType*>(p_c_grid)},
m_{m},
n_{n},
k_{k},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
}
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
index_t m_;
index_t n_;
index_t k_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceGemm::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
int block_size = 16;
dim3 block_dim(block_size, block_size, 1);
dim3 grid_dim(
(arg.m_ + block_size - 1) / block_size, (arg.n_ + block_size - 1) / block_size, 1);
auto launch_kernel = [&]() {
const auto kernel = naive_gemm_kernel<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AccDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ComputeTypeA,
ComputeTypeB>;
return launch_and_time_kernel(stream_config,
kernel,
grid_dim,
block_dim,
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.m_,
arg.n_,
arg.k_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
};
return launch_kernel();
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const void* p_a_grid,
const void* p_b_grid,
void* p_c_grid,
index_t m,
index_t n,
index_t k,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{
p_a_grid, p_b_grid, p_c_grid, m, n, k, a_element_op, b_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "Device Reference Gemm"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
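Illustrative host-side usage sketch (the device pointers p_a_dev/p_b_dev/p_c_dev, the M/N/K sizes, and the PassThrough element ops are assumptions, not taken from this diff):
using Row         = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Ref         = ck::tensor_operation::device::ReferenceGemm<Row, Row, Row, float, float, float, float, PassThrough, PassThrough, PassThrough>;
auto arg     = Ref::MakeArgument(p_a_dev, p_b_dev, p_c_dev, M, N, K, PassThrough{}, PassThrough{}, PassThrough{});
auto invoker = Ref::MakeInvoker();
float time   = invoker.Run(arg, StreamConfig{}); // launches naive_gemm_kernel and returns the time reported by launch_and_time_kernel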
......@@ -37,11 +37,7 @@ function(add_instance_library INSTANCE_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(INST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(INST_TARGETS ${GPU_TARGETS})
endif()
set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
# Do not build DL instances if DL_KERNELS macro is not set
foreach(source IN LISTS ARGN)
......@@ -64,9 +60,9 @@ function(add_instance_library INSTANCE_NAME)
list(REMOVE_ITEM ARGN "${source}")
endif()
endforeach()
# Do not build mha instances if gfx94 targets are not on the target list
# Do not build mha instances if gfx94 or gfx90a targets are not on the target list
foreach(source IN LISTS ARGN)
if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "mha")
if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha")
message("removing mha instance ${source} ")
list(REMOVE_ITEM ARGN "${source}")
endif()
......@@ -75,17 +71,13 @@ function(add_instance_library INSTANCE_NAME)
if(ARGN)
set(INST_OBJ)
foreach(source IN LISTS ARGN)
if(INSTANCES_ONLY)
set(INST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(INST_TARGETS ${GPU_TARGETS})
endif()
set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
if(source MATCHES "_xdl")
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "mha")
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
endif()
set(offload_targets)
foreach(target IN LISTS INST_TARGETS)
......@@ -191,12 +183,7 @@ FOREACH(subdir_path ${dir_list})
set(add_inst 1)
endif()
if(INSTANCES_ONLY)
set(INST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(INST_TARGETS ${GPU_TARGETS})
endif()
set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
if(("${cmake_instance}" MATCHES "quantization") AND (DEFINED DTYPES) AND (NOT DTYPES MATCHES "int8"))
message("quantization instances will not be built!")
......@@ -320,8 +307,7 @@ if(CK_DEVICE_CONV_INSTANCES)
endif()
if(CK_DEVICE_MHA_INSTANCES)
set(gpu_list ${INST_TARGETS})
list(FILTER gpu_list INCLUDE REGEX "^gfx94")
if(gpu_list)
if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
target_compile_features(device_mha_operations PUBLIC)
......
......@@ -24,7 +24,7 @@ set(PROFILER_SOURCES
profile_permute_scale.cpp
)
if(GPU_TARGETS MATCHES "gfx9")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
......@@ -49,7 +49,7 @@ if(GPU_TARGETS MATCHES "gfx9")
list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
endif()
list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
if(GPU_TARGETS MATCHES "gfx94")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
endif()
......@@ -69,7 +69,7 @@ if(GPU_TARGETS MATCHES "gfx9")
endif()
if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12" OR GPU_TARGETS MATCHES "gfx9")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
endif()
......@@ -111,7 +111,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_inst
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
if(GPU_TARGETS MATCHES "gfx9")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
......@@ -135,7 +135,7 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
if(GPU_TARGETS MATCHES "gfx94")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
endif()
......@@ -159,7 +159,7 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance)
endif()
if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
endif()
......
......@@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1
if [ $# -ge 2 ] ; then
GPU_TARGETS=$2
REST_ARGS=${@:3}
shift 2
REST_ARGS=$@
else
GPU_TARGETS="gfx908;gfx90a;gfx940"
REST_ARGS=
......
......@@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1
if [ $# -ge 2 ] ; then
GPU_TARGETS=$2
REST_ARGS=${@:3}
shift 2
REST_ARGS=$@
else
GPU_TARGETS="gfx908;gfx90a;gfx940"
REST_ARGS=
......
......@@ -41,11 +41,7 @@ function(add_test_executable TEST_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(TEST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(TEST_TARGETS ${GPU_TARGETS})
endif()
set(TEST_TARGETS ${SUPPORTED_GPU_TARGETS})
foreach(source IN LISTS ARGN)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
......@@ -122,11 +118,7 @@ function(add_gtest_executable TEST_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(TEST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(TEST_TARGETS ${GPU_TARGETS})
endif()
set(TEST_TARGETS ${SUPPORTED_GPU_TARGETS})
foreach(source IN LISTS ARGN)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
......@@ -211,10 +203,10 @@ add_subdirectory(conv_tensor_rearrange)
add_subdirectory(transpose)
add_subdirectory(permute_scale)
add_subdirectory(wrapper)
if(GPU_TARGETS MATCHES "gfx11")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op)
endif()
if(GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2) # smfmac needs ROCm6.2
if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2) # smfmac needs ROCm6.2
add_subdirectory(smfmac_op)
endif()
add_subdirectory(position_embedding)
......@@ -18,4 +18,9 @@ if(result EQUAL 0)
target_link_libraries(test_bf8 PRIVATE utility)
endif()
add_gtest_executable(test_custom_type test_custom_type.cpp)
if(result EQUAL 0)
target_link_libraries(test_custom_type PRIVATE utility)
endif()
add_gtest_executable(test_type_convert_const type_convert_const.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using ck::bf8_t;
using ck::bhalf_t;
using ck::f8_t;
using ck::half_t;
using ck::Number;
using ck::type_convert;
using ck::vector_type;
TEST(Custom_bool, TestSize)
{
struct custom_bool_t
{
bool data;
};
ASSERT_EQ(sizeof(custom_bool_t), sizeof(bool));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 2>), sizeof(vector_type<bool, 2>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 4>), sizeof(vector_type<bool, 4>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 8>), sizeof(vector_type<bool, 8>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 16>), sizeof(vector_type<bool, 16>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 32>), sizeof(vector_type<bool, 32>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 64>), sizeof(vector_type<bool, 64>));
}
TEST(Custom_bool, TestAsType)
{
struct custom_bool_t
{
using type = bool;
type data;
custom_bool_t() : data{type{}} {}
custom_bool_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bool> test_vec = {false, true, false, true};
// reference vector
vector_type<custom_bool_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bool_t>()(Number<i>{}).data, false);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bool_t>()(Number<i>{}) = custom_bool_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bool_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bool_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bool, TestAsTypeReshape)
{
struct custom_bool_t
{
using type = bool;
type data;
custom_bool_t() : data{type{}} {}
custom_bool_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bool> test_vec = {false, true, false, true};
// reference vector
vector_type<custom_bool_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bool_t>()(Number<i>{}).data, false);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bool_t>()(Number<i>{}) = custom_bool_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_bool_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_bool_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bool_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_int8, TestSize)
{
struct custom_int8_t
{
int8_t data;
};
ASSERT_EQ(sizeof(custom_int8_t), sizeof(int8_t));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 2>), sizeof(vector_type<int8_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 4>), sizeof(vector_type<int8_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 8>), sizeof(vector_type<int8_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 16>), sizeof(vector_type<int8_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 32>), sizeof(vector_type<int8_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 64>), sizeof(vector_type<int8_t, 64>));
}
TEST(Custom_int8, TestAsType)
{
struct custom_int8_t
{
using type = int8_t;
type data;
custom_int8_t() : data{type{}} {}
custom_int8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<int8_t> test_vec = {3, -6, 8, -2};
// reference vector
vector_type<custom_int8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_int8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_int8_t>()(Number<i>{}) = custom_int8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_int8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_int8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_int8, TestAsTypeReshape)
{
struct custom_int8_t
{
using type = int8_t;
type data;
custom_int8_t() : data{type{}} {}
custom_int8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<int8_t> test_vec = {3, -6, 8, -2};
// reference vector
vector_type<custom_int8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_int8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_int8_t>()(Number<i>{}) = custom_int8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_int8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_int8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_int8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_uint8, TestSize)
{
struct custom_uint8_t
{
uint8_t data;
};
ASSERT_EQ(sizeof(custom_uint8_t), sizeof(uint8_t));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 2>), sizeof(vector_type<uint8_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 4>), sizeof(vector_type<uint8_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 8>), sizeof(vector_type<uint8_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 16>), sizeof(vector_type<uint8_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 32>), sizeof(vector_type<uint8_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 64>), sizeof(vector_type<uint8_t, 64>));
}
TEST(Custom_uint8, TestAsType)
{
struct custom_uint8_t
{
using type = uint8_t;
type data;
custom_uint8_t() : data{type{}} {}
custom_uint8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<uint8_t> test_vec = {3, 6, 8, 2};
// reference vector
vector_type<custom_uint8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_uint8_t>()(Number<i>{}) = custom_uint8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_uint8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_uint8, TestAsTypeReshape)
{
struct custom_uint8_t
{
using type = uint8_t;
type data;
custom_uint8_t() : data{type{}} {}
custom_uint8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<uint8_t> test_vec = {3, 6, 8, 2};
// reference vector
vector_type<custom_uint8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_uint8_t>()(Number<i>{}) = custom_uint8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_uint8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_uint8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_f8, TestSize)
{
struct custom_f8_t
{
_BitInt(8) data;
};
ASSERT_EQ(sizeof(custom_f8_t), sizeof(_BitInt(8)));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 2>), sizeof(vector_type<_BitInt(8), 2>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 4>), sizeof(vector_type<_BitInt(8), 4>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 8>), sizeof(vector_type<_BitInt(8), 8>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 16>), sizeof(vector_type<_BitInt(8), 16>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 32>), sizeof(vector_type<_BitInt(8), 32>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 64>), sizeof(vector_type<_BitInt(8), 64>));
}
TEST(Custom_f8, TestAsType)
{
struct custom_f8_t
{
using type = _BitInt(8);
type data;
custom_f8_t() : data{type{}} {}
custom_f8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<_BitInt(8)> test_vec = {type_convert<_BitInt(8)>(0.3f),
type_convert<_BitInt(8)>(-0.6f),
type_convert<_BitInt(8)>(0.8f),
type_convert<_BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_f8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_f8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_f8_t>()(Number<i>{}) = custom_f8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_f8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_f8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_f8, TestAsTypeReshape)
{
struct custom_f8_t
{
using type = _BitInt(8);
type data;
custom_f8_t() : data{type{}} {}
custom_f8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<_BitInt(8)> test_vec = {type_convert<_BitInt(8)>(0.3f),
type_convert<_BitInt(8)>(-0.6f),
type_convert<_BitInt(8)>(0.8f),
type_convert<_BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_f8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_f8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_f8_t>()(Number<i>{}) = custom_f8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_f8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_f8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_f8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bf8, TestSize)
{
struct custom_bf8_t
{
unsigned _BitInt(8) data;
};
ASSERT_EQ(sizeof(custom_bf8_t), sizeof(unsigned _BitInt(8)));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 2>), sizeof(vector_type<unsigned _BitInt(8), 2>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 4>), sizeof(vector_type<unsigned _BitInt(8), 4>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 8>), sizeof(vector_type<unsigned _BitInt(8), 8>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 16>), sizeof(vector_type<unsigned _BitInt(8), 16>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 32>), sizeof(vector_type<unsigned _BitInt(8), 32>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 64>), sizeof(vector_type<unsigned _BitInt(8), 64>));
}
TEST(Custom_bf8, TestAsType)
{
struct custom_bf8_t
{
using type = unsigned _BitInt(8);
type data;
custom_bf8_t() : data{type{}} {}
custom_bf8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<unsigned _BitInt(8)> test_vec = {type_convert<unsigned _BitInt(8)>(0.3f),
type_convert<unsigned _BitInt(8)>(-0.6f),
type_convert<unsigned _BitInt(8)>(0.8f),
type_convert<unsigned _BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_bf8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bf8_t>()(Number<i>{}) = custom_bf8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bf8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bf8, TestAsTypeReshape)
{
struct custom_bf8_t
{
using type = unsigned _BitInt(8);
type data;
custom_bf8_t() : data{type{}} {}
custom_bf8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<unsigned _BitInt(8)> test_vec = {type_convert<unsigned _BitInt(8)>(0.3f),
type_convert<unsigned _BitInt(8)>(-0.6f),
type_convert<unsigned _BitInt(8)>(0.8f),
type_convert<unsigned _BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_bf8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bf8_t>()(Number<i>{}) = custom_bf8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_bf8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_bf8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_half, TestSize)
{
struct custom_half_t
{
half_t data;
};
ASSERT_EQ(sizeof(custom_half_t), sizeof(half_t));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 2>), sizeof(vector_type<half_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 4>), sizeof(vector_type<half_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 8>), sizeof(vector_type<half_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 16>), sizeof(vector_type<half_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 32>), sizeof(vector_type<half_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 64>), sizeof(vector_type<half_t, 64>));
}
TEST(Custom_half, TestAsType)
{
struct custom_half_t
{
using type = half_t;
type data;
custom_half_t() : data{type{}} {}
custom_half_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<half_t> test_vec = {half_t{0.3f}, half_t{-0.6f}, half_t{0.8f}, half_t{-0.2f}};
// reference vector
vector_type<custom_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_half_t>()(Number<i>{}).data,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_half_t>()(Number<i>{}) = custom_half_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_half_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_half_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_half, TestAsTypeReshape)
{
struct custom_half_t
{
using type = half_t;
type data;
custom_half_t() : data{type{}} {}
custom_half_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<half_t> test_vec = {half_t{0.3f}, half_t{-0.6f}, half_t{0.8f}, half_t{-0.2f}};
// reference vector
vector_type<custom_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_half_t>()(Number<i>{}).data,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_half_t>()(Number<i>{}) = custom_half_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_half_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_half_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_half_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bhalf, TestSize)
{
struct custom_bhalf_t
{
bhalf_t data;
};
ASSERT_EQ(sizeof(custom_bhalf_t), sizeof(bhalf_t));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 2>), sizeof(vector_type<bhalf_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 4>), sizeof(vector_type<bhalf_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 8>), sizeof(vector_type<bhalf_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 16>), sizeof(vector_type<bhalf_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 32>), sizeof(vector_type<bhalf_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 64>), sizeof(vector_type<bhalf_t, 64>));
}
TEST(Custom_bhalf, TestAsType)
{
struct custom_bhalf_t
{
using type = bhalf_t;
type data;
custom_bhalf_t() : data{type{}} {}
custom_bhalf_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bhalf_t> test_vec = {type_convert<bhalf_t>(0.3f),
type_convert<bhalf_t>(-0.6f),
type_convert<bhalf_t>(0.8f),
type_convert<bhalf_t>(-0.2f)};
// reference vector
vector_type<custom_bhalf_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data,
type_convert<bhalf_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bhalf_t>()(Number<i>{}) = custom_bhalf_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bhalf_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bhalf, TestAsTypeReshape)
{
struct custom_bhalf_t
{
using type = bhalf_t;
type data;
custom_bhalf_t() : data{type{}} {}
custom_bhalf_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bhalf_t> test_vec = {type_convert<bhalf_t>(0.3f),
type_convert<bhalf_t>(-0.6f),
type_convert<bhalf_t>(0.8f),
type_convert<bhalf_t>(-0.2f)};
// reference vector
vector_type<custom_bhalf_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data,
type_convert<bhalf_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bhalf_t>()(Number<i>{}) = custom_bhalf_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_bhalf_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_bhalf_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_float, TestSize)
{
struct custom_float_t
{
float data;
};
ASSERT_EQ(sizeof(custom_float_t), sizeof(float));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 2>), sizeof(vector_type<float, 2>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 4>), sizeof(vector_type<float, 4>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 8>), sizeof(vector_type<float, 8>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 16>), sizeof(vector_type<float, 16>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 32>), sizeof(vector_type<float, 32>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 64>), sizeof(vector_type<float, 64>));
}
TEST(Custom_float, TestAsType)
{
struct custom_float_t
{
using type = float;
type data;
custom_float_t() : data{type{}} {}
custom_float_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<float> test_vec = {0.3f, -0.6f, 0.8f, -0.2f};
// reference vector
vector_type<custom_float_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_float_t>()(Number<i>{}).data, 0.0f);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_float_t>()(Number<i>{}) = custom_float_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_float_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_float_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_float, TestAsTypeReshape)
{
struct custom_float_t
{
using type = float;
type data;
custom_float_t() : data{type{}} {}
custom_float_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<float> test_vec = {0.3f, -0.6f, 0.8f, -0.2f};
// reference vector
vector_type<custom_float_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_float_t>()(Number<i>{}).data, 0.0f);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_float_t>()(Number<i>{}) = custom_float_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_float_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_float_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_float_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_double, TestSize)
{
struct custom_double_t
{
double data;
};
ASSERT_EQ(sizeof(custom_double_t), sizeof(double));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 2>), sizeof(vector_type<double, 2>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 4>), sizeof(vector_type<double, 4>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 8>), sizeof(vector_type<double, 8>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 16>), sizeof(vector_type<double, 16>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 32>), sizeof(vector_type<double, 32>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 64>), sizeof(vector_type<double, 64>));
}
TEST(Custom_double, TestAsType)
{
struct custom_double_t
{
using type = double;
type data;
custom_double_t() : data{type{}} {}
custom_double_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<double> test_vec = {0.3, 0.6, 0.8, 0.2};
// reference vector
vector_type<custom_double_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_double_t>()(Number<i>{}).data, 0.0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_double_t>()(Number<i>{}) = custom_double_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_double_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_double_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_double, TestAsTypeReshape)
{
struct custom_double_t
{
using type = double;
type data;
custom_double_t() : data{type{}} {}
custom_double_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<double> test_vec = {0.3, 0.6, 0.8, 0.2};
// reference vector
vector_type<custom_double_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_double_t>()(Number<i>{}).data, 0.0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_double_t>()(Number<i>{}) = custom_double_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_double_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_double_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_double_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Complex_half, TestSize)
{
struct complex_half_t
{
half_t real;
half_t img;
};
ASSERT_EQ(sizeof(complex_half_t), sizeof(half_t) + sizeof(half_t));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 2>),
sizeof(vector_type<half_t, 2>) + sizeof(vector_type<half_t, 2>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 4>),
sizeof(vector_type<half_t, 4>) + sizeof(vector_type<half_t, 4>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 8>),
sizeof(vector_type<half_t, 8>) + sizeof(vector_type<half_t, 8>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 16>),
sizeof(vector_type<half_t, 16>) + sizeof(vector_type<half_t, 16>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 32>),
sizeof(vector_type<half_t, 32>) + sizeof(vector_type<half_t, 32>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 64>),
sizeof(vector_type<half_t, 64>) + sizeof(vector_type<half_t, 64>));
}
TEST(Complex_half, TestAlignment)
{
struct complex_half_t
{
half_t real;
half_t img;
};
ASSERT_EQ(alignof(vector_type<complex_half_t, 2>),
alignof(vector_type<half_t, 2>) + alignof(vector_type<half_t, 2>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 4>),
alignof(vector_type<half_t, 4>) + alignof(vector_type<half_t, 4>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 8>),
alignof(vector_type<half_t, 8>) + alignof(vector_type<half_t, 8>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 16>),
alignof(vector_type<half_t, 16>) + alignof(vector_type<half_t, 16>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 32>),
alignof(vector_type<half_t, 32>) + alignof(vector_type<half_t, 32>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 64>),
alignof(vector_type<half_t, 64>) + alignof(vector_type<half_t, 64>));
}
TEST(Complex_half, TestAsType)
{
struct complex_half_t
{
using type = half_t;
type real;
type img;
complex_half_t() : real{type{}}, img{type{}} {}
complex_half_t(type real_init, type img_init) : real{real_init}, img{img_init} {}
};
// test size
const int size = 4;
// custom type number of elements
const int num_elem = sizeof(complex_half_t) / sizeof(complex_half_t::type);
std::vector<half_t> test_vec = {half_t{0.3f},
half_t{-0.6f},
half_t{0.8f},
half_t{-0.2f},
half_t{0.5f},
half_t{-0.7f},
half_t{0.9f},
half_t{-0.3f}};
// reference vector
vector_type<complex_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).real,
type_convert<half_t>(0.0f));
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).img,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<complex_half_t>()(Number<i>{}) =
complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)};
});
// copy the vector
vector_type<complex_half_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).real,
test_vec.at(num_elem * i));
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).img,
test_vec.at(num_elem * i + 1));
});
}
TEST(Complex_half, TestAsTypeReshape)
{
struct complex_half_t
{
using type = half_t;
type real;
type img;
complex_half_t() : real{type{}}, img{type{}} {}
complex_half_t(type real_init, type img_init) : real{real_init}, img{img_init} {}
};
// test size
const int size = 4;
// custom type number of elements
const int num_elem = sizeof(complex_half_t) / sizeof(complex_half_t::type);
std::vector<half_t> test_vec = {half_t{0.3f},
half_t{-0.6f},
half_t{0.8f},
half_t{-0.2f},
half_t{0.5f},
half_t{-0.7f},
half_t{0.9f},
half_t{-0.3f}};
// reference vector
vector_type<complex_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).real,
type_convert<half_t>(0.0f));
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).img,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<complex_half_t>()(Number<i>{}) =
complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)};
});
// copy the first half of a vector
vector_type<complex_half_t, size / 2> left_vec{
right_vec.template AsType<vector_type<complex_half_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).real,
test_vec.at(num_elem * i));
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).img,
test_vec.at(num_elem * i + 1));
});
}
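// Editorial note (illustrative sketch, not part of the original test file): every
// TestAsType case above repeats the same fill / copy / verify round trip, so a shared
// helper along these lines could reduce duplication (the helper name, its signature, and
// the idea of factoring it out are assumptions, not something the original tests provide):
//
//   template <typename CustomT, typename ScalarT, int N>
//   void check_copy_roundtrip(const std::vector<ScalarT>& ref)
//   {
//       vector_type<CustomT, N> src;
//       // populate each lane from the reference values
//       ck::static_for<0, N, 1>{}(
//           [&](auto i) { src.template AsType<CustomT>()(Number<i>{}) = CustomT{ref.at(i)}; });
//       // copy-construct and verify every lane survived the copy
//       vector_type<CustomT, N> dst{src};
//       ck::static_for<0, N, 1>{}(
//           [&](auto i) { ASSERT_EQ(dst.template AsType<CustomT>()(Number<i>{}).data, ref.at(i)); });
//   }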