Commit d20c20a6 authored by Mirza Halilcevic

Merge remote-tracking branch 'upstream/develop' into gemm_elementwise_gemm

parents 250a89f3 10158b0f
......@@ -4,15 +4,15 @@
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
namespace ck_tile {
// A Tile Window: global memory
// B Tile Window: global memory
// C Distributed tensor: register
template <typename Problem, typename Policy = BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy>
struct BlockGemmPipelineAGmemBGmemCRegV2
template <typename Problem, typename Policy = GemmPipelineAGmemBGmemCRegV2DefaultPolicy>
struct GemmPipelineAGmemBGmemCRegV2
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using BDataType = remove_cvref_t<typename Problem::BDataType>;
......
......@@ -7,12 +7,11 @@
namespace ck_tile {
// Default policy for BlockGemmPipelineAGmemBGmemCRegV2
// Default policy for GemmPipelineAGmemBGmemCRegV2
// A default policy class should not be templated; put templates on member functions instead.
// NOTE: a policy should be bound to its corresponding operation. It's just a coincidence that
// BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy is the same as
// BlockGemmPipelineAGmemBGmemCRegV1DefaultPolicy
using BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy =
BlockGemmPipelineAGmemBGmemCRegV1DefaultPolicy;
// GemmPipelineAGmemBGmemCRegV2DefaultPolicy is the same as
// GemmPipelineAGmemBGmemCRegV1DefaultPolicy
using GemmPipelineAGmemBGmemCRegV2DefaultPolicy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy;
} // namespace ck_tile
......@@ -13,20 +13,23 @@ template <typename ADataType_,
typename BDataType_,
typename CDataType_,
typename BlockGemmShape_,
bool kPadA_ = false,
bool kPadB_ = false,
bool kPadC_ = false>
struct BlockGemmPipelineProblem
typename TileGemmTraits_>
struct GemmPipelineProblem
{
using ADataType = remove_cvref_t<ADataType_>;
using BDataType = remove_cvref_t<BDataType_>;
using CDataType = remove_cvref_t<CDataType_>;
using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
using GemmTraits = remove_cvref_t<TileGemmTraits_>;
static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size();
static constexpr bool kPadA = kPadA_;
static constexpr bool kPadB = kPadB_;
static constexpr bool kPadC = kPadC_;
static constexpr bool kPadA = GemmTraits::kPadA;
static constexpr bool kPadB = GemmTraits::kPadB;
static constexpr bool kPadC = GemmTraits::kPadC;
using LayoutA = remove_cvref_t<typename GemmTraits::LayoutA>;
using LayoutB = remove_cvref_t<typename GemmTraits::LayoutB>;
using LayoutC = remove_cvref_t<typename GemmTraits::LayoutC>;
static constexpr index_t AlignmentA = kPadA ? 1 : VectorLoadSize / sizeof(ADataType);
static constexpr index_t AlignmentB = kPadB ? 1 : VectorLoadSize / sizeof(BDataType);
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
namespace ck_tile {
// UniversalGemm Policy
template <typename LayoutA_, typename LayoutB_, typename LayoutC_>
struct UniversalGemmPipelineAgBgCrPolicy
{
using LayoutA = remove_cvref_t<LayoutA_>;
using LayoutB = remove_cvref_t<LayoutB_>;
using LayoutC = remove_cvref_t<LayoutC_>;
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
static constexpr auto I2 = number<2>{};
static constexpr bool TransposeC = true;
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
using ADataType = remove_cvref_t<typename Problem::ADataType>;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
if constexpr(std::is_same<tensor_layout::gemm::RowMajor, LayoutA>::value)
{
constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
? 1
: 32 * 4 / KPerBlock / sizeof(ADataType);
constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
make_tuple(K0 * number<MLdsLayer>{}, number<MPerBlock / MLdsLayer>{}, K1),
make_tuple(K1, number<KPerBlock * MLdsLayer>{}, I1));
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc,
make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
number<K0 * MLdsLayer>{})),
make_pass_through_transform(K1)),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto a_lds_block_desc_ak0_kMLdsLayer_m_ak1 = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(make_unmerge_transform(make_tuple(K0, number<MLdsLayer>{})),
make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
a_lds_block_desc_ak0_kMLdsLayer_m_ak1,
make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{}))),
make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return a_lds_block_desc_m_k;
}
else // ColumnMajor A
{
// The kfold and mpair dimensions are not always required.
// More dimensions in merge_transform increase the difficulty of generating the immarg
// (immediate-argument) offset for the compiler.
constexpr auto M0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I0);
constexpr auto M1 = MPerBlock / M0;
constexpr auto KThreadWrite = Problem::kBlockSize / M0;
constexpr auto K0PerThreadWrite = K0 / KThreadWrite;
constexpr auto KThreadRead = 64 / WarpGemm::kM;
constexpr auto K0PerThreadRead = K0 / KThreadRead;
constexpr auto kfold =
(K1 * M0 * sizeof(ADataType) > 128) ? 1 : 128 / (K1 * M0 * sizeof(ADataType));
constexpr auto KThreadReadPerm =
(kfold * K0PerThreadWrite / K0PerThreadRead) > 1
? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
: KThreadRead;
// 1 <= mpair <= M0
constexpr auto mpair = (K1 * WarpGemm::kM * sizeof(ADataType) > 128)
? 1
: ((128 / (K1 * WarpGemm::kM * sizeof(ADataType))) > M0
? M0
: 128 / (K1 * WarpGemm::kM * sizeof(ADataType)));
constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
number<K0PerThreadWrite>{},
number<KThreadReadPerm * M1>{},
number<kfold * M0 / mpair>{},
number<mpair>{},
K1));
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_xor_transform(
make_tuple(number<KThreadReadPerm * M1>{}, number<kfold * M0 / mpair>{})),
make_pass_through_transform(number<mpair>{}),
make_pass_through_transform(K1)),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<M1>{})),
make_unmerge_transform(make_tuple(number<kfold>{}, number<M0 / mpair>{})),
make_pass_through_transform(number<mpair>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{},
sequence<1>{},
sequence<2>{},
sequence<3>{},
sequence<4>{},
sequence<5>{}),
make_tuple(sequence<1>{},
sequence<2>{},
sequence<0, 3>{},
sequence<4, 5>{},
sequence<6>{},
sequence<7>{}));
constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
a_lds_block_desc_unmerged,
make_tuple(make_merge_transform_v3_division_mod(
make_tuple(number<KThreadReadPerm>{},
number<KThreadWrite / kfold / KThreadReadPerm>{},
number<kfold>{},
number<K0PerThreadWrite>{},
K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<M0 / mpair>{}, number<mpair>{}, number<M1>{}))),
make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return a_lds_block_desc_m_k;
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
using BDataType = remove_cvref_t<typename Problem::BDataType>;
constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
if constexpr(std::is_same<tensor_layout::gemm::ColumnMajor, LayoutB>::value)
{
// NLdsLayer * K0 as logical Bank
constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
? 1
: 32 * 4 / KPerBlock / sizeof(BDataType);
constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
make_tuple(K0 * number<NLdsLayer>{}, number<NPerBlock / NLdsLayer>{}, K1),
make_tuple(K1, number<KPerBlock * NLdsLayer>{}, I1));
constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
b_lds_block_desc,
make_tuple(make_xor_transform(make_tuple(number<NPerBlock / NLdsLayer>{},
number<K0 * NLdsLayer>{})),
make_pass_through_transform(K1)),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto b_lds_block_desc_bk0_kNLdsLayer_n_bk1 = transform_tensor_descriptor(
b_lds_block_desc_permuted,
make_tuple(make_unmerge_transform(make_tuple(K0, number<NLdsLayer>{})),
make_pass_through_transform(number<NPerBlock / NLdsLayer>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
b_lds_block_desc_bk0_kNLdsLayer_n_bk1,
make_tuple(make_merge_transform_v3_division_mod(make_tuple(K0, K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<NPerBlock / NLdsLayer>{}, number<NLdsLayer>{}))),
make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return b_lds_block_desc_n_k;
}
else // RowMajor B
{
constexpr auto N0 = get_warp_size() * Problem::BlockGemmShape::BlockWarps::at(I1);
constexpr auto N1 = NPerBlock / N0;
constexpr auto KThreadWrite = Problem::kBlockSize / N0;
constexpr auto K0PerThreadWrite = K0 / KThreadWrite;
constexpr auto KThreadRead = 64 / WarpGemm::kN;
constexpr auto K0PerThreadRead = K0 / KThreadRead;
constexpr auto kfold =
(K1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (K1 * N0 * sizeof(BDataType));
constexpr auto KThreadReadPerm =
(kfold * K0PerThreadWrite / K0PerThreadRead) > 1
? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
: KThreadRead;
// 1 <= npair <= N0
constexpr auto npair = (K1 * WarpGemm::kN * sizeof(BDataType) > 128)
? 1
: ((128 / (K1 * WarpGemm::kN * sizeof(BDataType))) > N0
? N0
: 128 / (K1 * WarpGemm::kN * sizeof(BDataType)));
constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(number<KThreadWrite / kfold / KThreadReadPerm>{},
number<K0PerThreadWrite>{},
number<KThreadReadPerm * N1>{},
number<kfold * N0 / npair>{},
number<npair>{},
K1));
constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
b_lds_block_desc,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_xor_transform(
make_tuple(number<KThreadReadPerm * N1>{}, number<kfold * N0 / npair>{})),
make_pass_through_transform(number<npair>{}),
make_pass_through_transform(K1)),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}),
make_tuple(
sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}));
constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor(
b_lds_block_desc_permuted,
make_tuple(
make_pass_through_transform(number<KThreadWrite / kfold / KThreadReadPerm>{}),
make_pass_through_transform(number<K0PerThreadWrite>{}),
make_unmerge_transform(make_tuple(number<KThreadReadPerm>{}, number<N1>{})),
make_unmerge_transform(make_tuple(number<kfold>{}, number<N0 / npair>{})),
make_pass_through_transform(number<npair>{}),
make_pass_through_transform(K1)),
make_tuple(sequence<0>{},
sequence<1>{},
sequence<2>{},
sequence<3>{},
sequence<4>{},
sequence<5>{}),
make_tuple(sequence<1>{},
sequence<2>{},
sequence<0, 3>{},
sequence<4, 5>{},
sequence<6>{},
sequence<7>{}));
constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
b_lds_block_desc_unmerged,
make_tuple(make_merge_transform_v3_division_mod(
make_tuple(number<KThreadReadPerm>{},
number<KThreadWrite / kfold / KThreadReadPerm>{},
number<kfold>{},
number<K0PerThreadWrite>{},
K1)),
make_merge_transform_v3_division_mod(
make_tuple(number<N0 / npair>{}, number<npair>{}, number<N1>{}))),
make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}),
make_tuple(sequence<1>{}, sequence<0>{}));
return b_lds_block_desc_n_k;
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
{
constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
MakeALdsBlockDescriptor<Problem>().get_element_space_size();
return smem_size_a;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
{
constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
return smem_size_b;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
index_t smem_size = 0;
smem_size += smem_size_a + smem_size_b;
return smem_size;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
constexpr index_t M2 = get_warp_size() / K0;
constexpr index_t M1 = BlockSize / get_warp_size();
static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
constexpr index_t M0 = MPerBlock / (M2 * M1);
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
{
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
Problem::BlockGemmShape::WarpTile::at(I0),
Problem::BlockGemmShape::WarpTile::at(I1),
Problem::BlockGemmShape::WarpTile::at(I2),
TransposeC>;
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = WarpGemm::kK;
constexpr index_t K0 = KPerBlock / K1;
constexpr index_t N2 = get_warp_size() / K0;
constexpr index_t N1 = BlockSize / get_warp_size();
static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
constexpr index_t N0 = NPerBlock / (N2 * N1);
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
{
using AccDataType = float;
using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
using WarpTile = typename Problem::BlockGemmShape::WarpTile;
using WarpGemm = WarpGemmMfmaDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
AccDataType,
WarpTile::at(I0),
WarpTile::at(I1),
WarpTile::at(I2),
TransposeC>;
using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
BlockWarps,
WarpGemm>;
return BlockGemmASmemBSmemCRegV1<Problem, BlockGemmPolicy>{};
}
};
} // namespace ck_tile
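Illustrative sketch (not part of this diff): given a pipeline Problem type (assumed here), the new policy is parameterized on the three layouts, and its LDS footprint can be queried at compile time:
using Policy = ck_tile::UniversalGemmPipelineAgBgCrPolicy<ck_tile::tensor_layout::gemm::RowMajor, ck_tile::tensor_layout::gemm::ColumnMajor, ck_tile::tensor_layout::gemm::RowMajor>;
constexpr ck_tile::index_t lds_bytes = Policy::GetSmemSize<Problem>(); // bytes for the A tile plus the B tile in LDS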
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <bool kPadA_,
bool kPadB_,
bool kPadC_,
typename LayoutA_,
typename LayoutB_,
typename LayoutC_>
struct TileGemmTraits
{
static constexpr bool kPadA = kPadA_;
static constexpr bool kPadB = kPadB_;
static constexpr bool kPadC = kPadC_;
using LayoutA = LayoutA_;
using LayoutB = LayoutB_;
using LayoutC = LayoutC_;
};
} // namespace ck_tile
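Illustrative sketch (assumed tile shape and data types, not taken from this diff): the traits bundle the padding flags and layouts, and feed the reworked GemmPipelineProblem:
using Traits  = ck_tile::TileGemmTraits<false, false, false, ck_tile::tensor_layout::gemm::RowMajor, ck_tile::tensor_layout::gemm::ColumnMajor, ck_tile::tensor_layout::gemm::RowMajor>;
using Problem = ck_tile::GemmPipelineProblem<ck_tile::half_t, ck_tile::half_t, float, Shape, Traits>; // Shape: an assumed BlockGemmShape type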
......@@ -31,8 +31,14 @@ struct Layernorm2dFwd
static constexpr ck_tile::index_t kMPerBlock = Problem::BlockShape::kMPerBlock;
static constexpr ck_tile::index_t kNPerBlock = Problem::BlockShape::kNPerBlock;
static constexpr bool kPadM = Problem::kPadM;
static constexpr bool kPadN = Problem::kPadN;
static constexpr ck_tile::index_t kNThreadPerWarp = Problem::BlockShape::kNThreadPerWarp;
static constexpr ck_tile::index_t kNPerThread = Problem::BlockShape::kNPerThread;
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
struct Kargs
{
......@@ -96,19 +102,25 @@ struct Layernorm2dFwd
sequence<2>>{});
}
template <typename Dstr>
CK_TILE_DEVICE static constexpr auto GetNPerThread(Dstr)
CK_TILE_DEVICE static int GetWelfordMaxCount(int N)
{
constexpr auto nDstrSpan = Dstr::get_distributed_spans().template at<1>();
using Lengths = decltype(nDstrSpan.impl_);
constexpr ck_tile::index_t kNThreadPerBlock = kNPerBlock / kNPerThread;
ck_tile::index_t ret = 1;
int thread_id_n = get_thread_id() % kNThreadPerBlock;
int max_count =
__builtin_amdgcn_readfirstlane(N < kNPerBlock ? 0 : kNPerThread * (N / kNPerBlock));
int n_per_block_tail_loop =
__builtin_amdgcn_readfirstlane(N - max_count * kNThreadPerBlock);
ck_tile::static_for<0, Lengths::size(), 1>{}(
[&](auto idx) { ret *= Lengths::template at(idx); });
if(n_per_block_tail_loop > 0)
{
int thread_max_n = (thread_id_n + 1) * kNPerThread;
int delta = clamp(thread_max_n - n_per_block_tail_loop, 0, kNPerThread);
max_count += kNPerThread - delta;
}
return ret;
return max_count;
}
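// Worked example (illustrative, assumed tile sizes): with kNPerBlock = 128, kNPerThread = 8
// (so kNThreadPerBlock = 16) and N = 200:
//   max_count             = 8 * (200 / 128) = 8
//   n_per_block_tail_loop = 200 - 8 * 16    = 72
//   thread_id_n 0..8  -> thread_max_n <= 72, delta = 0, max_count = 16
//   thread_id_n 9..15 -> thread_max_n >  72, delta = 8, max_count stays 8
// Total elements counted per row: 9 * 16 + 7 * 8 = 200, which matches N.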
template <typename DistributedTensor>
......@@ -129,42 +141,29 @@ struct Layernorm2dFwd
return out_dstr_tensor;
}
template <bool Cond = (kHasGamma && kHasBeta)>
CK_TILE_DEVICE std::enable_if_t<Cond> TwoPassLayernorm2dFwd(const XDataType* p_x,
const GammaDataType* p_gamma,
const BetaDataType* p_beta,
YDataType* p_y,
MeanDataType* p_mean,
InvStdDataType* p_invStd,
const ComputeDataType epsilon,
ck_tile::index_t M,
ck_tile::index_t N) const
template <typename XBlockWindow,
typename GammaBlockWindow,
typename BetaBlockWindow,
typename YBlockWindow,
typename MeanBlockWindow,
typename InvStdBlockWindow,
bool Cond = (kHasGamma && kHasBeta)>
CK_TILE_DEVICE std::enable_if_t<Cond>
TwoPassLayernorm2dFwd(XBlockWindow& x_block_window,
GammaBlockWindow& gamma_block_window,
BetaBlockWindow& beta_block_window,
YBlockWindow& y_block_window,
MeanBlockWindow& mean_block_window,
InvStdBlockWindow& inv_std_block_window,
ComputeDataType epsilon,
ck_tile::index_t N) const
{
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
p_x, make_tuple(M, N), make_tuple(N, 1), number<32>{}, number<1>{});
const auto gamma_n = make_naive_tensor_view<address_space_enum::global>(
p_gamma, make_tuple(N), make_tuple(1), number<32>{}, number<1>{});
// TODO - Optimize tail loop to reduce move_tile_window()
index_t num_n_tile_iteration =
__builtin_amdgcn_readfirstlane(integer_divide_ceil(N, kNPerBlock));
const auto beta_n = make_naive_tensor_view<address_space_enum::global>(
p_beta, make_tuple(N), make_tuple(1), number<32>{}, number<1>{});
const auto iM = get_block_id() * kMPerBlock;
constexpr auto xDstr = MakeXBlockTileDistribution();
auto x_block_window = make_tile_window(
x_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0}, xDstr);
index_t num_n_tile_iteration = __builtin_amdgcn_readfirstlane(N / kNPerBlock);
// TODO: padding - handle max_count if N % kNPerBlock != 0
constexpr auto NPerThread = GetNPerThread(xDstr);
ThreadWelford<ComputeDataType, XDataType> thread_welford{
type_convert<int>(NPerThread * N / kNPerBlock)};
int welford_max_count = GetWelfordMaxCount(N);
ThreadWelford<ComputeDataType, XDataType> thread_welford{welford_max_count};
using XTensorType = decltype(load_tile(x_block_window));
auto mean_compute_block_tensor =
......@@ -190,44 +189,14 @@ struct Layernorm2dFwd
auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon);
if constexpr(kSaveMean)
{
const auto mean_m = make_naive_tensor_view_packed<address_space_enum::global>(
p_mean, make_tuple(M), number<32>{});
auto mean_block_window =
make_tile_window(mean_m, make_tuple(number<kMPerBlock>{}), {iM});
store_tile(mean_block_window, cast_tile<MeanDataType>(mean_compute_block_tensor));
}
if constexpr(kSaveInvStd)
{
const auto inv_std_m = make_naive_tensor_view_packed<address_space_enum::global>(
p_invStd, make_tuple(M), number<32>{});
auto inv_std_block_window =
make_tile_window(inv_std_m, make_tuple(number<kMPerBlock>{}), {iM});
store_tile(inv_std_block_window, cast_tile<MeanDataType>(inv_std_compute_block_tensor));
}
// TODO: Extract normalize pipeline
const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
p_y, make_tuple(M, N), make_tuple(N, 1), number<32>{}, number<1>{});
auto y_block_window = make_tile_window(
y_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0});
constexpr auto gammaDstr = MakeGammaBetaBlockTileDistribution();
constexpr auto betaDstr = gammaDstr;
auto gamma_block_window =
make_tile_window(gamma_n, make_tuple(number<kNPerBlock>{}), {0}, gammaDstr);
auto beta_block_window = make_tile_window(
beta_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {0}, betaDstr);
store_tile(inv_std_block_window,
cast_tile<InvStdDataType>(inv_std_compute_block_tensor));
// read x in reverse to reuse the cache
ck_tile::index_t stride_to_right_most_window = N - kNPerBlock;
ck_tile::index_t stride_to_right_most_window =
N % kNPerBlock == 0 ? N - kNPerBlock : N - N % kNPerBlock;
move_tile_window(x_block_window, {0, -kNPerBlock});
move_tile_window(gamma_block_window, {stride_to_right_most_window});
......@@ -274,17 +243,209 @@ struct Layernorm2dFwd
}
}
template <typename XBlockWindow,
typename GammaBlockWindow,
typename BetaBlockWindow,
typename YBlockWindow,
typename MeanBlockWindow,
typename InvStdBlockWindow,
bool Cond = (kHasGamma && kHasBeta)>
CK_TILE_DEVICE std::enable_if_t<Cond>
OnePassLayernorm2dFwd(XBlockWindow& x_block_window,
GammaBlockWindow& gamma_block_window,
BetaBlockWindow& beta_block_window,
YBlockWindow& y_block_window,
MeanBlockWindow& mean_block_window,
InvStdBlockWindow& inv_std_block_window,
ComputeDataType epsilon,
ck_tile::index_t N) const
{
int welford_max_count = GetWelfordMaxCount(N);
ThreadWelford<ComputeDataType, XDataType> thread_welford{welford_max_count};
using XTensorType = decltype(load_tile(x_block_window));
auto mean_compute_block_tensor =
thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
auto var_compute_block_tensor =
thread_welford.template MakeInitialMeanVarDistributedTensor<XTensorType>();
clear_tile(mean_compute_block_tensor);
clear_tile(var_compute_block_tensor);
const auto x_block_tensor = load_tile(x_block_window);
thread_welford(x_block_tensor, mean_compute_block_tensor, var_compute_block_tensor);
// TODO: support cross warp Welford
WarpMergeWelford<ComputeDataType, true>{}(
mean_compute_block_tensor, var_compute_block_tensor, thread_welford.cur_count_);
auto inv_std_compute_block_tensor = InvSqrt(var_compute_block_tensor, epsilon);
if constexpr(kSaveMean)
store_tile(mean_block_window, cast_tile<MeanDataType>(mean_compute_block_tensor));
if constexpr(kSaveInvStd)
store_tile(inv_std_block_window,
cast_tile<InvStdDataType>(inv_std_compute_block_tensor));
// normalize
const auto gamma_block_tensor = load_tile(gamma_block_window);
const auto beta_block_tensor = load_tile(beta_block_window);
constexpr auto x_spans = decltype(x_block_tensor)::get_distributed_spans();
auto y_block_tensor =
make_static_distributed_tensor<YDataType>(x_block_tensor.get_tile_distribution());
sweep_tile_span(x_spans[I1], [&](auto idx1) {
constexpr auto j_idx = make_tuple(idx1);
const auto gamma = type_convert<ComputeDataType>(gamma_block_tensor[j_idx]);
const auto beta = type_convert<ComputeDataType>(beta_block_tensor[j_idx]);
sweep_tile_span(x_spans[I0], [&](auto idx0) {
constexpr auto i_idx = make_tuple(idx0);
constexpr auto i_j_idx = make_tuple(idx0, idx1);
const auto mean = mean_compute_block_tensor[i_idx];
const auto inv_std = inv_std_compute_block_tensor[i_idx];
const auto x = type_convert<ComputeDataType>(x_block_tensor[i_j_idx]);
auto y = (x - mean) * inv_std * gamma + beta;
y_block_tensor(i_j_idx) = type_convert<YDataType>(y);
});
});
store_tile(y_block_window, y_block_tensor);
}
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
TwoPassLayernorm2dFwd(static_cast<const XDataType*>(kargs.p_x),
static_cast<const GammaDataType*>(kargs.p_gamma),
static_cast<const BetaDataType*>(kargs.p_beta),
static_cast<YDataType*>(kargs.p_y),
static_cast<MeanDataType*>(kargs.p_mean),
static_cast<InvStdDataType*>(kargs.p_invStd),
static_cast<const ComputeDataType>(kargs.epsilon),
kargs.M,
kargs.N);
const auto x_m_n = [&]() {
const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XDataType*>(kargs.p_x),
make_tuple(kargs.M, kargs.N),
make_tuple(kargs.N, 1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(x_dram_naive,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
sequence<kPadM, kPadN>{});
}();
const auto gamma_n = [&]() {
const auto gamma_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<const GammaDataType*>(kargs.p_gamma),
make_tuple(kargs.N),
make_tuple(1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(
gamma_dram_naive, make_tuple(number<kNPerBlock>{}), sequence<kPadN>{});
}();
const auto beta_n = [&]() {
const auto beta_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<const BetaDataType*>(kargs.p_beta),
make_tuple(kargs.N),
make_tuple(1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(
beta_dram_naive, make_tuple(number<kNPerBlock>{}), sequence<kPadN>{});
}();
const auto iM = get_block_id() * kMPerBlock;
constexpr auto xDstr = MakeXBlockTileDistribution();
auto x_block_window = make_tile_window(
x_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0}, xDstr);
const auto y_m_n = [&]() {
const auto y_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<YDataType*>(kargs.p_y),
make_tuple(kargs.M, kargs.N),
make_tuple(kargs.N, 1),
number<kNPerThread>{},
number<1>{});
return pad_tensor_view(y_dram_naive,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
sequence<kPadM, kPadN>{});
}();
auto y_block_window = make_tile_window(
y_m_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {iM, 0});
constexpr auto gammaDstr = MakeGammaBetaBlockTileDistribution();
constexpr auto betaDstr = gammaDstr;
auto gamma_block_window =
make_tile_window(gamma_n, make_tuple(number<kNPerBlock>{}), {0}, gammaDstr);
auto beta_block_window = make_tile_window(
beta_n, make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}), {0}, betaDstr);
auto mean_block_window = [&]() {
if constexpr(kSaveMean)
{
const auto mean_m = [&]() {
const auto mean_dram_naive =
make_naive_tensor_view_packed<address_space_enum::global>(
static_cast<MeanDataType*>(kargs.p_mean),
make_tuple(kargs.M),
number<1>{});
return pad_tensor_view(
mean_dram_naive, make_tuple(number<kMPerBlock>{}), sequence<kPadM>{});
}();
return make_tile_window(mean_m, make_tuple(number<kMPerBlock>{}), {iM});
}
else
return make_null_tile_window(make_tuple(number<kMPerBlock>{}));
}();
auto inv_std_block_window = [&]() {
if constexpr(kSaveInvStd)
{
const auto inv_std_m = [&]() {
const auto inv_std_dram_naive =
make_naive_tensor_view_packed<address_space_enum::global>(
static_cast<InvStdDataType*>(kargs.p_invStd),
make_tuple(kargs.M),
number<1>{});
return pad_tensor_view(
inv_std_dram_naive, make_tuple(number<kMPerBlock>{}), sequence<kPadM>{});
}();
return make_tile_window(inv_std_m, make_tuple(number<kMPerBlock>{}), {iM});
}
else
return make_null_tile_window(make_tuple(number<kMPerBlock>{}));
}();
if(kargs.N <= kNPerBlock)
OnePassLayernorm2dFwd(x_block_window,
gamma_block_window,
beta_block_window,
y_block_window,
mean_block_window,
inv_std_block_window,
static_cast<const ComputeDataType>(kargs.epsilon),
kargs.N);
else
TwoPassLayernorm2dFwd(x_block_window,
gamma_block_window,
beta_block_window,
y_block_window,
mean_block_window,
inv_std_block_window,
static_cast<const ComputeDataType>(kargs.epsilon),
kargs.N);
}
};
......
......@@ -14,17 +14,21 @@ template <typename XDataType_,
typename YDataType_,
typename MeanDataType_,
typename InvStdDataType_,
typename BlockShape_>
typename BlockShape_,
bool kPadM_,
bool kPadN_>
struct BlockLayernorm2dFwdProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using GammaDataType = remove_cvref_t<GammaDataType_>;
using BetaDataType = remove_cvref_t<BetaDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YDataType = remove_cvref_t<YDataType_>;
using MeanDataType = remove_cvref_t<MeanDataType_>;
using InvStdDataType = remove_cvref_t<InvStdDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
using XDataType = remove_cvref_t<XDataType_>;
using GammaDataType = remove_cvref_t<GammaDataType_>;
using BetaDataType = remove_cvref_t<BetaDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YDataType = remove_cvref_t<YDataType_>;
using MeanDataType = remove_cvref_t<MeanDataType_>;
using InvStdDataType = remove_cvref_t<InvStdDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
static constexpr bool kPadM = kPadM_;
static constexpr bool kPadN = kPadN_;
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename ComputeTypeA,
typename ComputeTypeB>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
naive_gemm_kernel(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
CDataType* __restrict__ p_c_grid,
index_t m,
index_t n,
index_t k,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation c_element_op)
{
using RowMajor = ck::tensor_layout::gemm::RowMajor;
const int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int col_idx = blockIdx.y * blockDim.y + threadIdx.y;
if(row_idx < m && col_idx < n)
{
AccDataType v_acc = static_cast<AccDataType>(0.0);
ComputeTypeA v_a = static_cast<ComputeTypeA>(0.0);
ComputeTypeB v_b = static_cast<ComputeTypeB>(0.0);
CDataType v_c = static_cast<CDataType>(0.0);
for(int k_idx = 0; k_idx < k; ++k_idx)
{
// check the input matrix layouts
int element_idx_a = 0;
int element_idx_b = 0;
if constexpr(std::is_same_v<ALayout, RowMajor>)
{
element_idx_a = row_idx * k + k_idx;
}
else
{
element_idx_a = row_idx + m * k_idx;
}
if constexpr(std::is_same_v<BLayout, RowMajor>)
{
element_idx_b = k_idx * n + col_idx;
}
else
{
element_idx_b = k_idx + k * col_idx;
}
// apply a_element_op
a_element_op(v_a, p_a_grid[element_idx_a]);
// apply b_element_op
b_element_op(v_b, p_b_grid[element_idx_b]);
// multiply and accumulate
v_acc += static_cast<AccDataType>(v_a) * static_cast<AccDataType>(v_b);
}
// apply c_element_op
c_element_op(v_c, v_acc);
// check output matrix layout
int element_idx_c = 0;
if constexpr(std::is_same_v<CLayout, RowMajor>)
{
element_idx_c = row_idx * n + col_idx;
}
else
{
element_idx_c = row_idx + m * col_idx;
}
// prepare output
p_c_grid[element_idx_c] = v_c;
}
}
} // namespace ck
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename ComputeTypeA = CDataType,
typename ComputeTypeB = ComputeTypeA>
struct ReferenceGemm : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
Argument(const void* p_a_grid,
const void* p_b_grid,
void* p_c_grid,
index_t m,
index_t n,
index_t k,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
p_c_grid_{static_cast<CDataType*>(p_c_grid)},
m_{m},
n_{n},
k_{k},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
}
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
index_t m_;
index_t n_;
index_t k_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceGemm::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
int block_size = 16;
dim3 block_dim(block_size, block_size, 1);
dim3 grid_dim(
(arg.m_ + block_size - 1) / block_size, (arg.n_ + block_size - 1) / block_size, 1);
auto launch_kernel = [&]() {
const auto kernel = naive_gemm_kernel<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AccDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
ComputeTypeA,
ComputeTypeB>;
return launch_and_time_kernel(stream_config,
kernel,
grid_dim,
block_dim,
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.m_,
arg.n_,
arg.k_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
};
return launch_kernel();
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
static auto MakeArgument(const void* p_a_grid,
const void* p_b_grid,
void* p_c_grid,
index_t m,
index_t n,
index_t k,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{
p_a_grid, p_b_grid, p_c_grid, m, n, k, a_element_op, b_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "Device Reference Gemm"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
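Illustrative host-side usage sketch (the device pointers p_a_dev/p_b_dev/p_c_dev, the M/N/K sizes, and the PassThrough element ops are assumptions, not taken from this diff):
using Row         = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Ref         = ck::tensor_operation::device::ReferenceGemm<Row, Row, Row, float, float, float, float, PassThrough, PassThrough, PassThrough>;
auto arg     = Ref::MakeArgument(p_a_dev, p_b_dev, p_c_dev, M, N, K, PassThrough{}, PassThrough{}, PassThrough{});
auto invoker = Ref::MakeInvoker();
float time   = invoker.Run(arg, StreamConfig{}); // launches naive_gemm_kernel and returns the time reported by launch_and_time_kernel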
......@@ -37,11 +37,7 @@ function(add_instance_library INSTANCE_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(INST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(INST_TARGETS ${GPU_TARGETS})
endif()
set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
# Do not build DL instances if DL_KERNELS macro is not set
foreach(source IN LISTS ARGN)
......@@ -64,9 +60,9 @@ function(add_instance_library INSTANCE_NAME)
list(REMOVE_ITEM ARGN "${source}")
endif()
endforeach()
# Do not build mha instances if gfx94 targets are not on the target list
# Do not build mha instances if gfx94 or gfx90a targets are not on the target list
foreach(source IN LISTS ARGN)
if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "mha")
if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha")
message("removing mha instance ${source} ")
list(REMOVE_ITEM ARGN "${source}")
endif()
......@@ -75,17 +71,13 @@ function(add_instance_library INSTANCE_NAME)
if(ARGN)
set(INST_OBJ)
foreach(source IN LISTS ARGN)
if(INSTANCES_ONLY)
set(INST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(INST_TARGETS ${GPU_TARGETS})
endif()
set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
if(source MATCHES "_xdl")
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "mha")
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
endif()
set(offload_targets)
foreach(target IN LISTS INST_TARGETS)
......@@ -191,12 +183,7 @@ FOREACH(subdir_path ${dir_list})
set(add_inst 1)
endif()
if(INSTANCES_ONLY)
set(INST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(INST_TARGETS ${GPU_TARGETS})
endif()
set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
if(("${cmake_instance}" MATCHES "quantization") AND (DEFINED DTYPES) AND (NOT DTYPES MATCHES "int8"))
message("quantization instances will not be built!")
......@@ -320,8 +307,7 @@ if(CK_DEVICE_CONV_INSTANCES)
endif()
if(CK_DEVICE_MHA_INSTANCES)
set(gpu_list ${INST_TARGETS})
list(FILTER gpu_list INCLUDE REGEX "^gfx94")
if(gpu_list)
if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
target_compile_features(device_mha_operations PUBLIC)
......
......@@ -24,7 +24,7 @@ set(PROFILER_SOURCES
profile_permute_scale.cpp
)
if(GPU_TARGETS MATCHES "gfx9")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
......@@ -49,7 +49,7 @@ if(GPU_TARGETS MATCHES "gfx9")
list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
endif()
list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
if(GPU_TARGETS MATCHES "gfx94")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
endif()
......@@ -69,7 +69,7 @@ if(GPU_TARGETS MATCHES "gfx9")
endif()
if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12" OR GPU_TARGETS MATCHES "gfx9")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
endif()
......@@ -111,7 +111,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_inst
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
if(GPU_TARGETS MATCHES "gfx9")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
......@@ -135,7 +135,7 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
if(GPU_TARGETS MATCHES "gfx94")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
endif()
......@@ -159,7 +159,7 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance)
endif()
if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
endif()
......
......@@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1
if [ $# -ge 2 ] ; then
GPU_TARGETS=$2
REST_ARGS=${@:3}
shift 2
REST_ARGS=$@
else
GPU_TARGETS="gfx908;gfx90a;gfx940"
REST_ARGS=
......
......@@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1
if [ $# -ge 2 ] ; then
GPU_TARGETS=$2
REST_ARGS=${@:3}
shift 2
REST_ARGS=$@
else
GPU_TARGETS="gfx908;gfx90a;gfx940"
REST_ARGS=
......
......@@ -41,11 +41,7 @@ function(add_test_executable TEST_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(TEST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(TEST_TARGETS ${GPU_TARGETS})
endif()
set(TEST_TARGETS ${SUPPORTED_GPU_TARGETS})
foreach(source IN LISTS ARGN)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
......@@ -122,11 +118,7 @@ function(add_gtest_executable TEST_NAME)
endforeach()
endif()
if(INSTANCES_ONLY)
set(TEST_TARGETS ${DEFAULT_GPU_TARGETS})
else()
set(TEST_TARGETS ${GPU_TARGETS})
endif()
set(TEST_TARGETS ${SUPPORTED_GPU_TARGETS})
foreach(source IN LISTS ARGN)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
......@@ -211,10 +203,10 @@ add_subdirectory(conv_tensor_rearrange)
add_subdirectory(transpose)
add_subdirectory(permute_scale)
add_subdirectory(wrapper)
if(GPU_TARGETS MATCHES "gfx11")
if(SUPPORTED_GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op)
endif()
if(GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2) # smfmac needs ROCm6.2
if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2) # smfmac needs ROCm6.2
add_subdirectory(smfmac_op)
endif()
add_subdirectory(position_embedding)
......@@ -18,4 +18,9 @@ if(result EQUAL 0)
target_link_libraries(test_bf8 PRIVATE utility)
endif()
add_gtest_executable(test_custom_type test_custom_type.cpp)
if(result EQUAL 0)
target_link_libraries(test_custom_type PRIVATE utility)
endif()
add_gtest_executable(test_type_convert_const type_convert_const.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using ck::bf8_t;
using ck::bhalf_t;
using ck::f8_t;
using ck::half_t;
using ck::Number;
using ck::type_convert;
using ck::vector_type;
TEST(Custom_bool, TestSize)
{
struct custom_bool_t
{
bool data;
};
ASSERT_EQ(sizeof(custom_bool_t), sizeof(bool));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 2>), sizeof(vector_type<bool, 2>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 4>), sizeof(vector_type<bool, 4>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 8>), sizeof(vector_type<bool, 8>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 16>), sizeof(vector_type<bool, 16>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 32>), sizeof(vector_type<bool, 32>));
ASSERT_EQ(sizeof(vector_type<custom_bool_t, 64>), sizeof(vector_type<bool, 64>));
}
TEST(Custom_bool, TestAsType)
{
struct custom_bool_t
{
using type = bool;
type data;
custom_bool_t() : data{type{}} {}
custom_bool_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bool> test_vec = {false, true, false, true};
// reference vector
vector_type<custom_bool_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bool_t>()(Number<i>{}).data, false);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bool_t>()(Number<i>{}) = custom_bool_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bool_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bool_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bool, TestAsTypeReshape)
{
struct custom_bool_t
{
using type = bool;
type data;
custom_bool_t() : data{type{}} {}
custom_bool_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bool> test_vec = {false, true, false, true};
// reference vector
vector_type<custom_bool_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bool_t>()(Number<i>{}).data, false);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bool_t>()(Number<i>{}) = custom_bool_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_bool_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_bool_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bool_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_int8, TestSize)
{
struct custom_int8_t
{
int8_t data;
};
ASSERT_EQ(sizeof(custom_int8_t), sizeof(int8_t));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 2>), sizeof(vector_type<int8_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 4>), sizeof(vector_type<int8_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 8>), sizeof(vector_type<int8_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 16>), sizeof(vector_type<int8_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 32>), sizeof(vector_type<int8_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_int8_t, 64>), sizeof(vector_type<int8_t, 64>));
}
TEST(Custom_int8, TestAsType)
{
struct custom_int8_t
{
using type = int8_t;
type data;
custom_int8_t() : data{type{}} {}
custom_int8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<int8_t> test_vec = {3, -6, 8, -2};
// reference vector
vector_type<custom_int8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_int8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_int8_t>()(Number<i>{}) = custom_int8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_int8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_int8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_int8, TestAsTypeReshape)
{
struct custom_int8_t
{
using type = int8_t;
type data;
custom_int8_t() : data{type{}} {}
custom_int8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<int8_t> test_vec = {3, -6, 8, -2};
// reference vector
vector_type<custom_int8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_int8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_int8_t>()(Number<i>{}) = custom_int8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_int8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_int8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_int8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_uint8, TestSize)
{
struct custom_uint8_t
{
uint8_t data;
};
ASSERT_EQ(sizeof(custom_uint8_t), sizeof(uint8_t));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 2>), sizeof(vector_type<uint8_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 4>), sizeof(vector_type<uint8_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 8>), sizeof(vector_type<uint8_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 16>), sizeof(vector_type<uint8_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 32>), sizeof(vector_type<uint8_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_uint8_t, 64>), sizeof(vector_type<uint8_t, 64>));
}
TEST(Custom_uint8, TestAsType)
{
struct custom_uint8_t
{
using type = uint8_t;
type data;
custom_uint8_t() : data{type{}} {}
custom_uint8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<uint8_t> test_vec = {3, 6, 8, 2};
// reference vector
vector_type<custom_uint8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_uint8_t>()(Number<i>{}) = custom_uint8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_uint8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_uint8, TestAsTypeReshape)
{
struct custom_uint8_t
{
using type = uint8_t;
type data;
custom_uint8_t() : data{type{}} {}
custom_uint8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<uint8_t> test_vec = {3, 6, 8, 2};
// reference vector
vector_type<custom_uint8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, 0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_uint8_t>()(Number<i>{}) = custom_uint8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_uint8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_uint8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_f8, TestSize)
{
struct custom_f8_t
{
_BitInt(8) data;
};
ASSERT_EQ(sizeof(custom_f8_t), sizeof(_BitInt(8)));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 2>), sizeof(vector_type<_BitInt(8), 2>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 4>), sizeof(vector_type<_BitInt(8), 4>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 8>), sizeof(vector_type<_BitInt(8), 8>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 16>), sizeof(vector_type<_BitInt(8), 16>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 32>), sizeof(vector_type<_BitInt(8), 32>));
ASSERT_EQ(sizeof(vector_type<custom_f8_t, 64>), sizeof(vector_type<_BitInt(8), 64>));
}
TEST(Custom_f8, TestAsType)
{
struct custom_f8_t
{
using type = _BitInt(8);
type data;
custom_f8_t() : data{type{}} {}
custom_f8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<_BitInt(8)> test_vec = {type_convert<_BitInt(8)>(0.3f),
type_convert<_BitInt(8)>(-0.6f),
type_convert<_BitInt(8)>(0.8f),
type_convert<_BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_f8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_f8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_f8_t>()(Number<i>{}) = custom_f8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_f8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_f8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_f8, TestAsTypeReshape)
{
struct custom_f8_t
{
using type = _BitInt(8);
type data;
custom_f8_t() : data{type{}} {}
custom_f8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<_BitInt(8)> test_vec = {type_convert<_BitInt(8)>(0.3f),
type_convert<_BitInt(8)>(-0.6f),
type_convert<_BitInt(8)>(0.8f),
type_convert<_BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_f8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_f8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_f8_t>()(Number<i>{}) = custom_f8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_f8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_f8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_f8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bf8, TestSize)
{
struct custom_bf8_t
{
unsigned _BitInt(8) data;
};
ASSERT_EQ(sizeof(custom_bf8_t), sizeof(unsigned _BitInt(8)));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 2>), sizeof(vector_type<unsigned _BitInt(8), 2>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 4>), sizeof(vector_type<unsigned _BitInt(8), 4>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 8>), sizeof(vector_type<unsigned _BitInt(8), 8>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 16>), sizeof(vector_type<unsigned _BitInt(8), 16>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 32>), sizeof(vector_type<unsigned _BitInt(8), 32>));
ASSERT_EQ(sizeof(vector_type<custom_bf8_t, 64>), sizeof(vector_type<unsigned _BitInt(8), 64>));
}
TEST(Custom_bf8, TestAsType)
{
struct custom_bf8_t
{
using type = unsigned _BitInt(8);
type data;
custom_bf8_t() : data{type{}} {}
custom_bf8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<unsigned _BitInt(8)> test_vec = {type_convert<unsigned _BitInt(8)>(0.3f),
type_convert<unsigned _BitInt(8)>(-0.6f),
type_convert<unsigned _BitInt(8)>(0.8f),
type_convert<unsigned _BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_bf8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bf8_t>()(Number<i>{}) = custom_bf8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bf8_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bf8, TestAsTypeReshape)
{
struct custom_bf8_t
{
using type = unsigned _BitInt(8);
type data;
custom_bf8_t() : data{type{}} {}
custom_bf8_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<unsigned _BitInt(8)> test_vec = {type_convert<unsigned _BitInt(8)>(0.3f),
type_convert<unsigned _BitInt(8)>(-0.6f),
type_convert<unsigned _BitInt(8)>(0.8f),
type_convert<unsigned _BitInt(8)>(-0.2f)};
// reference vector
vector_type<custom_bf8_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}(
[&](auto i) { ASSERT_EQ(right_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, 0); });
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bf8_t>()(Number<i>{}) = custom_bf8_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_bf8_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_bf8_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_half, TestSize)
{
struct custom_half_t
{
half_t data;
};
ASSERT_EQ(sizeof(custom_half_t), sizeof(half_t));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 2>), sizeof(vector_type<half_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 4>), sizeof(vector_type<half_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 8>), sizeof(vector_type<half_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 16>), sizeof(vector_type<half_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 32>), sizeof(vector_type<half_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_half_t, 64>), sizeof(vector_type<half_t, 64>));
}
TEST(Custom_half, TestAsType)
{
struct custom_half_t
{
using type = half_t;
type data;
custom_half_t() : data{type{}} {}
custom_half_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<half_t> test_vec = {half_t{0.3f}, half_t{-0.6f}, half_t{0.8f}, half_t{-0.2f}};
// reference vector
vector_type<custom_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_half_t>()(Number<i>{}).data,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_half_t>()(Number<i>{}) = custom_half_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_half_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_half_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_half, TestAsTypeReshape)
{
struct custom_half_t
{
using type = half_t;
type data;
custom_half_t() : data{type{}} {}
custom_half_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<half_t> test_vec = {half_t{0.3f}, half_t{-0.6f}, half_t{0.8f}, half_t{-0.2f}};
// reference vector
vector_type<custom_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_half_t>()(Number<i>{}).data,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_half_t>()(Number<i>{}) = custom_half_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_half_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_half_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_half_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bhalf, TestSize)
{
struct custom_bhalf_t
{
bhalf_t data;
};
ASSERT_EQ(sizeof(custom_bhalf_t), sizeof(bhalf_t));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 2>), sizeof(vector_type<bhalf_t, 2>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 4>), sizeof(vector_type<bhalf_t, 4>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 8>), sizeof(vector_type<bhalf_t, 8>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 16>), sizeof(vector_type<bhalf_t, 16>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 32>), sizeof(vector_type<bhalf_t, 32>));
ASSERT_EQ(sizeof(vector_type<custom_bhalf_t, 64>), sizeof(vector_type<bhalf_t, 64>));
}
TEST(Custom_bhalf, TestAsType)
{
struct custom_bhalf_t
{
using type = bhalf_t;
type data;
custom_bhalf_t() : data{type{}} {}
custom_bhalf_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bhalf_t> test_vec = {type_convert<bhalf_t>(0.3f),
type_convert<bhalf_t>(-0.6f),
type_convert<bhalf_t>(0.8f),
type_convert<bhalf_t>(-0.2f)};
// reference vector
vector_type<custom_bhalf_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data,
type_convert<bhalf_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bhalf_t>()(Number<i>{}) = custom_bhalf_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bhalf_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_bhalf, TestAsTypeReshape)
{
struct custom_bhalf_t
{
using type = bhalf_t;
type data;
custom_bhalf_t() : data{type{}} {}
custom_bhalf_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<bhalf_t> test_vec = {type_convert<bhalf_t>(0.3f),
type_convert<bhalf_t>(-0.6f),
type_convert<bhalf_t>(0.8f),
type_convert<bhalf_t>(-0.2f)};
// reference vector
vector_type<custom_bhalf_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data,
type_convert<bhalf_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bhalf_t>()(Number<i>{}) = custom_bhalf_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_bhalf_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_bhalf_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_float, TestSize)
{
struct custom_float_t
{
float data;
};
ASSERT_EQ(sizeof(custom_float_t), sizeof(float));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 2>), sizeof(vector_type<float, 2>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 4>), sizeof(vector_type<float, 4>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 8>), sizeof(vector_type<float, 8>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 16>), sizeof(vector_type<float, 16>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 32>), sizeof(vector_type<float, 32>));
ASSERT_EQ(sizeof(vector_type<custom_float_t, 64>), sizeof(vector_type<float, 64>));
}
TEST(Custom_float, TestAsType)
{
struct custom_float_t
{
using type = float;
type data;
custom_float_t() : data{type{}} {}
custom_float_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<float> test_vec = {0.3f, -0.6f, 0.8f, -0.2f};
// reference vector
vector_type<custom_float_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_float_t>()(Number<i>{}).data, 0.0f);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_float_t>()(Number<i>{}) = custom_float_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_float_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_float_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_float, TestAsTypeReshape)
{
struct custom_float_t
{
using type = float;
type data;
custom_float_t() : data{type{}} {}
custom_float_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<float> test_vec = {0.3f, -0.6f, 0.8f, -0.2f};
// reference vector
vector_type<custom_float_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_float_t>()(Number<i>{}).data, 0.0f);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_float_t>()(Number<i>{}) = custom_float_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_float_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_float_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_float_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_double, TestSize)
{
struct custom_double_t
{
double data;
};
ASSERT_EQ(sizeof(custom_double_t), sizeof(double));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 2>), sizeof(vector_type<double, 2>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 4>), sizeof(vector_type<double, 4>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 8>), sizeof(vector_type<double, 8>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 16>), sizeof(vector_type<double, 16>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 32>), sizeof(vector_type<double, 32>));
ASSERT_EQ(sizeof(vector_type<custom_double_t, 64>), sizeof(vector_type<double, 64>));
}
TEST(Custom_double, TestAsType)
{
struct custom_double_t
{
using type = double;
type data;
custom_double_t() : data{type{}} {}
custom_double_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<double> test_vec = {0.3, 0.6, 0.8, 0.2};
// reference vector
vector_type<custom_double_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_double_t>()(Number<i>{}).data, 0.0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_double_t>()(Number<i>{}) = custom_double_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_double_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_double_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Custom_double, TestAsTypeReshape)
{
struct custom_double_t
{
using type = double;
type data;
custom_double_t() : data{type{}} {}
custom_double_t(type init) : data{init} {}
};
// test size
const int size = 4;
std::vector<double> test_vec = {0.3, 0.6, 0.8, 0.2};
// reference vector
vector_type<custom_double_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<custom_double_t>()(Number<i>{}).data, 0.0);
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_double_t>()(Number<i>{}) = custom_double_t{test_vec.at(i)};
});
// copy the first half of a vector
vector_type<custom_double_t, size / 2> left_vec{
right_vec.template AsType<vector_type<custom_double_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_double_t>()(Number<i>{}).data, test_vec.at(i));
});
}
TEST(Complex_half, TestSize)
{
struct complex_half_t
{
half_t real;
half_t img;
};
ASSERT_EQ(sizeof(complex_half_t), sizeof(half_t) + sizeof(half_t));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 2>),
sizeof(vector_type<half_t, 2>) + sizeof(vector_type<half_t, 2>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 4>),
sizeof(vector_type<half_t, 4>) + sizeof(vector_type<half_t, 4>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 8>),
sizeof(vector_type<half_t, 8>) + sizeof(vector_type<half_t, 8>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 16>),
sizeof(vector_type<half_t, 16>) + sizeof(vector_type<half_t, 16>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 32>),
sizeof(vector_type<half_t, 32>) + sizeof(vector_type<half_t, 32>));
ASSERT_EQ(sizeof(vector_type<complex_half_t, 64>),
sizeof(vector_type<half_t, 64>) + sizeof(vector_type<half_t, 64>));
}
TEST(Complex_half, TestAlignment)
{
struct complex_half_t
{
half_t real;
half_t img;
};
ASSERT_EQ(alignof(vector_type<complex_half_t, 2>),
alignof(vector_type<half_t, 2>) + alignof(vector_type<half_t, 2>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 4>),
alignof(vector_type<half_t, 4>) + alignof(vector_type<half_t, 4>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 8>),
alignof(vector_type<half_t, 8>) + alignof(vector_type<half_t, 8>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 16>),
alignof(vector_type<half_t, 16>) + alignof(vector_type<half_t, 16>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 32>),
alignof(vector_type<half_t, 32>) + alignof(vector_type<half_t, 32>));
ASSERT_EQ(alignof(vector_type<complex_half_t, 64>),
alignof(vector_type<half_t, 64>) + alignof(vector_type<half_t, 64>));
}
TEST(Complex_half, TestAsType)
{
struct complex_half_t
{
using type = half_t;
type real;
type img;
complex_half_t() : real{type{}}, img{type{}} {}
complex_half_t(type real_init, type img_init) : real{real_init}, img{img_init} {}
};
// test size
const int size = 4;
// custom type number of elements
const int num_elem = sizeof(complex_half_t) / sizeof(complex_half_t::type);
std::vector<half_t> test_vec = {half_t{0.3f},
half_t{-0.6f},
half_t{0.8f},
half_t{-0.2f},
half_t{0.5f},
half_t{-0.7f},
half_t{0.9f},
half_t{-0.3f}};
// reference vector
vector_type<complex_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).real,
type_convert<half_t>(0.0f));
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).img,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<complex_half_t>()(Number<i>{}) =
complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)};
});
// copy the vector
vector_type<complex_half_t, size> left_vec{right_vec};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).real,
test_vec.at(num_elem * i));
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).img,
test_vec.at(num_elem * i + 1));
});
}
TEST(Complex_half, TestAsTypeReshape)
{
struct complex_half_t
{
using type = half_t;
type real;
type img;
complex_half_t() : real{type{}}, img{type{}} {}
complex_half_t(type real_init, type img_init) : real{real_init}, img{img_init} {}
};
// test size
const int size = 4;
// custom type number of elements
const int num_elem = sizeof(complex_half_t) / sizeof(complex_half_t::type);
std::vector<half_t> test_vec = {half_t{0.3f},
half_t{-0.6f},
half_t{0.8f},
half_t{-0.2f},
half_t{0.5f},
half_t{-0.7f},
half_t{0.9f},
half_t{-0.3f}};
// reference vector
vector_type<complex_half_t, size> right_vec;
// check default CTOR
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).real,
type_convert<half_t>(0.0f));
ASSERT_EQ(right_vec.template AsType<complex_half_t>()(Number<i>{}).img,
type_convert<half_t>(0.0f));
});
// assign test values to the vector
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<complex_half_t>()(Number<i>{}) =
complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)};
});
// copy the first half of a vector
vector_type<complex_half_t, size / 2> left_vec{
right_vec.template AsType<vector_type<complex_half_t, size / 2>::type>()(Number<0>{})};
// check if values were copied correctly
ck::static_for<0, size / 2, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).real,
test_vec.at(num_elem * i));
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).img,
test_vec.at(num_elem * i + 1));
});
}
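// Editorial note (illustrative sketch, not part of the original test file): every
// TestAsType case above repeats the same fill / copy / verify round trip, so a shared
// helper along these lines could reduce duplication (the helper name, its signature, and
// the idea of factoring it out are assumptions, not something the original tests provide):
//
//   template <typename CustomT, typename ScalarT, int N>
//   void check_copy_roundtrip(const std::vector<ScalarT>& ref)
//   {
//       vector_type<CustomT, N> src;
//       // populate each lane from the reference values
//       ck::static_for<0, N, 1>{}(
//           [&](auto i) { src.template AsType<CustomT>()(Number<i>{}) = CustomT{ref.at(i)}; });
//       // copy-construct and verify every lane survived the copy
//       vector_type<CustomT, N> dst{src};
//       ck::static_for<0, N, 1>{}(
//           [&](auto i) { ASSERT_EQ(dst.template AsType<CustomT>()(Number<i>{}).data, ref.at(i)); });
//   }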