Commit 9ba504b6 authored by ThomasNing

Merge with develop: support fp8 with computev4

parents e3402c93 f49de496
@@ -63,8 +63,7 @@ __global__ void
     const Block2ETileMap block_2_etile_map,
     index_t NRaw)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemmWelford::GetSharedMemoryNumberOfByte()];
     GridwiseGemmWelford::template Run<HasMainKBlockLoop>(
...
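Every kernel-guard hunk in this commit makes the same substitution: the per-architecture checks for `__gfx908__`, `__gfx90a__`, and `__gfx94__` collapse into the single `__gfx9__` umbrella macro. A minimal sketch of the shared pattern, assuming the toolchain or a project header defines `__gfx9__` for every GFX9 device pass (that assumption is exactly what makes the one-macro guard equivalent to the old disjunction):

```cpp
// Hypothetical illustration, not part of the diff: the shape shared by all
// guarded kernels in this commit.
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    // Host pass, or device pass for a supported GFX9 target
    // (gfx908, gfx90a, gfx94x, ...): emit the real kernel body.
#else
    // Unsupported device target: consume the arguments ("ignore = karg;"
    // in the hunks below) so the compiler stays quiet, and emit nothing.
#endif
```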
@@ -60,8 +60,7 @@ __global__ void
     const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock,
     const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
...
@@ -52,8 +52,7 @@ __global__ void
     e_grid_desc_mblock_mperblock_nblock_nperblock,
     const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
...
@@ -138,6 +138,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
         if(stream_config.log_level_ > 0)
         {
             arg.Print();
+            GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
         }
         if(!GridwiseGemm::CheckValidity(arg))
@@ -733,7 +734,9 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
             << "BlkGemmPipelineVersion: "
             << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
             << "BlkGemmPipelinePrefetchStages: "
-            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "Kpack: "
+            << GridwiseGemm::BlockwiseGemmPipe::AMmaKStride;
         // clang-format on
         return str.str();
...
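The two DeviceGemm_Xdl_CShuffleV3 hunks above extend the existing diagnostics: a dump of the hot-loop instruction schedule next to `arg.Print()`, and a `Kpack` field (reported from `AMmaKStride`) in the type string. A sketch of how a caller would surface them, assuming the usual CK invoker pattern; only `log_level_` and the printed names are taken from the diff, the rest is illustrative:

```cpp
// Sketch (assumed call-site shape): diagnostics stay silent unless the
// caller raises log_level_ in the StreamConfig it passes to Run().
StreamConfig stream_config{};
stream_config.log_level_ = 1; // > 0 triggers arg.Print() and
                              // HotLoopInstList::Print() in the hunk above

auto invoker = gemm.MakeInvoker();    // gemm: a DeviceGemm_Xdl_CShuffleV3 instance
invoker.Run(argument, stream_config); // GetTypeString() now also reports "Kpack"
```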
@@ -47,8 +47,7 @@ __global__ void
     e_grid_desc_mblock_mperblock_nblock_nperblock,
     const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
...
@@ -37,8 +37,7 @@ __global__ void
     const BElementwiseOperation b_element_op,
     const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t block_id = get_block_1d_id();
...
@@ -87,8 +87,7 @@ __global__ void
     const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
     const ComputePtrOffsetOfN compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
     const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
...
@@ -60,8 +60,7 @@ __global__ void
     const Block2CTileMap block_2_ctile_map,
     const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -103,7 +102,7 @@ __global__ void
     compute_ptr_offset_of_batch.GetAPtrOffset(0);
     compute_ptr_offset_of_batch.GetBPtrOffset(0);
     compute_ptr_offset_of_batch.GetCPtrOffset(0);
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 template <index_t NDimSpatial,
...
@@ -55,8 +55,7 @@ __global__ void
     [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
     [[maybe_unused]] const index_t num_k_per_block)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
@@ -85,7 +84,7 @@ __global__ void
         k_idx);
 #else
     ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 template <typename GridwiseGemm,
@@ -145,7 +144,7 @@ __global__ void
         k_idx);
 #else
     ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 template <ck::index_t NDimSpatial,
...
@@ -99,8 +99,7 @@ __global__ void
     const ComputePtrOffsetOfG compute_ptr_offset_of_groups,
     const ComputePtrOffsetOfN compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
...
@@ -118,7 +118,7 @@ __global__ void
         c_grid_desc_mblock_mperblock_nblock_nperblock);
 #else
     ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 template <typename GridwiseGemm,
@@ -184,7 +184,7 @@ __global__ void
         c_grid_desc_mblock_mperblock_nblock_nperblock);
 #else
     ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 } // namespace
...
@@ -155,8 +155,7 @@ __global__ void
     const Block2ETileMap block_2_ctile_map,
     const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
...
@@ -52,8 +52,7 @@ __global__ void
     const ComputePtrOffset compute_ptr_offset_of_groups,
     const ComputePtrOffset compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t block_id_x = __builtin_amdgcn_readfirstlane(blockIdx.x);
...
@@ -68,8 +68,7 @@ __global__ void
     const BElementwiseOperation b_element_op,
     const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
     __shared__ uint8_t p_shared[shared_size];
@@ -404,7 +403,7 @@ __global__ void
     ignore = a_element_op;
     ignore = b_element_op;
     ignore = cde_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 template <typename ALayout,
...
@@ -43,8 +43,7 @@ __global__ void
     const B1ElementwiseOperation b1_element_op,
     const CElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t block_id = get_block_1d_id();
@@ -109,7 +108,7 @@ __global__ void
     ignore = acc_element_op;
     ignore = b1_element_op;
     ignore = c_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 // Computes C = A * B0 * B1
...
@@ -38,8 +38,7 @@ __global__ void
     const BElementwiseOperation b_element_op,
     const CDEElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t block_id = get_block_1d_id();
...
@@ -50,8 +50,7 @@ __global__ void
     const BElementwiseOperation b_element_op,
     const CDEElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t block_id = get_block_1d_id();
...
@@ -40,8 +40,7 @@ __global__ void
     const BElementwiseOperation b_element_op,
     const CElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
     __shared__ uint8_t p_shared[shared_size];
@@ -80,7 +79,7 @@ __global__ void
     ignore = a_element_op;
     ignore = b_element_op;
     ignore = c_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }
 template <typename ALayout,
...
@@ -56,8 +56,7 @@ __global__ void
     const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
     const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     const index_t num_blocks_per_batch =
...
@@ -16,7 +16,8 @@ namespace ck {
 // [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production]
 // (https://arxiv.org/abs/2211.10017) and implementation:
 // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
-__host__ __device__ inline half4_t pki4_to_half4(int q)
+// Convert lower part of packed int4 -> int4 to half
+__device__ inline half4_t i4_to_half4(int q)
 {
     const int LO = 0x000f000f;
     const int HI = 0x00f000f0;
@@ -44,7 +45,7 @@ __host__ __device__ inline half4_t pki4_to_half4(int q)
     return res.template AsType<half4_t>()[Number<0>{}];
 }
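These helpers rely on the classic fp16 magic-number trick from the FasterTransformer header referenced above: OR-ing a 4-bit field into the mantissa of the fp16 constant `0x6400` (1024.0) produces `1024 + v` exactly, so adding `-1032` (the packed `0xE408E408` constant visible in the deleted `pki4_to_half2` below) recovers the signed value `v - 8`. A standalone host-only sketch that verifies the identity, with no CK or GPU dependencies; only the bit patterns come from this file:

```cpp
// Host-only verification of the int4 -> fp16 magic-number conversion.
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode an IEEE fp16 bit pattern (normal numbers only, which is all we need).
static float half_bits_to_float(uint16_t h)
{
    const int sign     = (h >> 15) & 0x1;
    const int exponent = (h >> 10) & 0x1f;
    const int mantissa = h & 0x3ff;
    const float mag    = std::ldexp(1.0f + mantissa / 1024.0f, exponent - 15);
    return sign ? -mag : mag;
}

int main()
{
    for(uint16_t v = 0; v < 16; ++v) // every unsigned 4-bit field
    {
        // OR the nibble into the mantissa of 1024.0 -> fp16 value 1024 + v.
        const uint16_t biased = 0x6400 | v;
        // Subtracting 1032 (= 1024 + 8) applies the int4 zero-point of 8.
        const float decoded = half_bits_to_float(biased) - 1032.0f;
        std::printf("nibble %2u -> %5.1f (expect %3d)\n", unsigned(v), decoded, int(v) - 8);
    }
    return 0;
}
```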
-__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half2_t& scale)
+__device__ inline half4_t i4_to_half4_scale(int q, const ck::half2_t& scale)
 {
     const int LO = 0x000f000f;
     const int HI = 0x00f000f0;
@@ -78,34 +79,7 @@ __host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half2_t&
     return res.template AsType<half4_t>()[Number<0>{}];
 }
-__host__ __device__ inline half2_t pki4_to_half2(pk_i4_t q)
-{
-#if 1
-    uint8_t x_u8 = ck::bit_cast<uint8_t>(q);
-    uint32_t i4s = ((x_u8 & 0x0f) << 16) | ((x_u8 & 0xf0) >> 4);
-    const int EX  = 0x64006400;
-    const int SUB = 0xE408E408; //-8
-    int lo = i4s | EX;
-    return amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
-#else
-    uint8_t x_u8 = ck::bit_cast<uint8_t>(q);
-    vector_type<half_t, 2> res;
-    half_t x_h = (x_u8 & 0x0f) - 8;
-    half_t x_l = ((x_u8 & 0xf0) >> 4) - 8;
-    res.template AsType<half_t>()(Number<0>{}) = x_l;
-    res.template AsType<half_t>()(Number<1>{}) = x_h;
-    return res.template AsType<half2_t>()[Number<0>{}];
-#endif
-}
-
-__host__ __device__ inline bhalf4_t pki4_to_bhalf4(int q)
+__device__ inline bhalf4_t i4_to_bhalf4(int q)
 {
     uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
@@ -134,21 +108,6 @@ __host__ __device__ inline bhalf4_t pki4_to_bhalf4(int q)
     return res.template AsType<bhalf4_t>()[Number<0>{}];
 }
-__host__ __device__ inline bhalf2_t pki4_to_bhalf2(pk_i4_t q)
-{
-    uint8_t x_u8 = ck::bit_cast<uint8_t>(q);
-    float x_h = ((x_u8 & 0x0f) >> 0) - 8.f;
-    float x_l = ((x_u8 & 0xf0) >> 4) - 8.f;
-    vector_type<bhalf_t, 2> res;
-    res.template AsType<bhalf_t>()(Number<0>{}) = type_convert<bhalf_t>(x_l);
-    res.template AsType<bhalf_t>()(Number<1>{}) = type_convert<bhalf_t>(x_h);
-    return res.template AsType<bhalf2_t>()[Number<0>{}];
-}
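The deleted `pki4_to_half2`/`pki4_to_bhalf2` helpers are superseded by `type_convert<>` overloads at the call sites below. For reference, their scalar semantics, reconstructed from the deleted bodies as a plain C++ sketch (`float` stands in for `half_t`/`bhalf_t`):

```cpp
// Scalar semantics of the deleted pk_i4 -> 2-element conversions: one byte
// carries two unsigned 4-bit fields, each stored with a zero-point of 8.
#include <cstdint>
#include <utility>

// Lane order as in the removed code: high nibble -> lane 0, low nibble -> lane 1.
inline std::pair<float, float> unpack_pk_i4(uint8_t x_u8)
{
    const float x_h = static_cast<float>(x_u8 & 0x0f) - 8.f;        // low nibble
    const float x_l = static_cast<float>((x_u8 & 0xf0) >> 4) - 8.f; // high nibble
    return {x_l, x_h};
}
```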
 namespace tensor_operation {
 namespace element_wise {
@@ -159,11 +118,11 @@ struct PassThroughPack8
     __host__ __device__ constexpr void operator()(ck::half8_t& y, const ck::pk_i4x4_t& x) const
     {
-#if 1
+#if CK_USE_PK4_LAYOUT_SHUFFLE
         vector_type<half_t, 8> result;
-        result.template AsType<half4_t>()(Number<0>{}) = pki4_to_half4(bit_cast<int>(x));
-        result.template AsType<half4_t>()(Number<1>{}) = pki4_to_half4(bit_cast<int>(x) >> 8);
+        result.template AsType<half4_t>()(Number<0>{}) = i4_to_half4(bit_cast<int>(x));
+        result.template AsType<half4_t>()(Number<1>{}) = i4_to_half4(bit_cast<int>(x) >> 8);
         y = result.template AsType<half8_t>()[Number<0>{}];
 #else
@@ -171,13 +130,13 @@ struct PassThroughPack8
         vector_type<pk_i4_t, 4> src{x};
         dst.template AsType<half2_t>()(Number<0>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<0>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<0>{}]);
         dst.template AsType<half2_t>()(Number<1>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<1>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<1>{}]);
         dst.template AsType<half2_t>()(Number<2>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<2>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<2>{}]);
         dst.template AsType<half2_t>()(Number<3>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<3>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<3>{}]);
         y = dst.template AsType<half8_t>()[Number<0>{}];
 #endif
@@ -185,11 +144,11 @@ struct PassThroughPack8
     __host__ __device__ constexpr void operator()(ck::bhalf8_t& y, const ck::pk_i4x4_t& x) const
     {
-#if 1
+#if CK_USE_PK4_LAYOUT_SHUFFLE
         vector_type<bhalf_t, 8> result;
-        result.template AsType<bhalf4_t>()(Number<0>{}) = pki4_to_bhalf4(bit_cast<int>(x));
-        result.template AsType<bhalf4_t>()(Number<1>{}) = pki4_to_bhalf4(bit_cast<int>(x) >> 16);
+        result.template AsType<bhalf4_t>()(Number<0>{}) = i4_to_bhalf4(bit_cast<int>(x));
+        result.template AsType<bhalf4_t>()(Number<1>{}) = i4_to_bhalf4(bit_cast<int>(x) >> 16);
         y = result.template AsType<bhalf8_t>()[Number<0>{}];
 #else
@@ -197,13 +156,13 @@ struct PassThroughPack8
         vector_type<pk_i4_t, 4> src{x};
         dst.template AsType<bhalf2_t>()(Number<0>{}) =
-            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<0>{}]);
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<0>{}]);
         dst.template AsType<bhalf2_t>()(Number<1>{}) =
-            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<1>{}]);
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<1>{}]);
         dst.template AsType<bhalf2_t>()(Number<2>{}) =
-            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<2>{}]);
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<2>{}]);
         dst.template AsType<bhalf2_t>()(Number<3>{}) =
-            pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<3>{}]);
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<3>{}]);
         y = dst.template AsType<bhalf8_t>()[Number<0>{}];
 #endif
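The bf16 path above feeds `i4_to_bhalf4`, whose first step (visible in the earlier hunk) spaces the four nibbles of `q` out to one per byte so they can be converted in parallel. A host-only check of that spread, using an illustrative input value:

```cpp
// Verify the nibble spread from i4_to_bhalf4: each 4-bit field of q lands
// in its own byte of i8s.
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t q   = 0xABCD; // illustrative packed input
    const uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
    std::printf("%08x\n", static_cast<unsigned>(i8s)); // prints 0a0b0c0d
    return 0;
}
```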
@@ -219,12 +178,12 @@ struct DequantPack8
     __host__ __device__ constexpr void
     operator()(ck::half8_t& y, const ck::pk_i4x4_t& x, const ck::half2_t& z) const
     {
-#if 1
+#if CK_USE_PK4_LAYOUT_SHUFFLE
         vector_type<half_t, 8> result;
-        result.template AsType<half4_t>()(Number<0>{}) = pki4_to_half4_scale(bit_cast<int>(x), z);
+        result.template AsType<half4_t>()(Number<0>{}) = i4_to_half4_scale(bit_cast<int>(x), z);
         result.template AsType<half4_t>()(Number<1>{}) =
-            pki4_to_half4_scale(bit_cast<int>(x) >> 8, z);
+            i4_to_half4_scale(bit_cast<int>(x) >> 8, z);
         y = result.template AsType<half8_t>()[Number<0>{}];
 #else
@@ -232,13 +191,13 @@ struct DequantPack8
         vector_type<pk_i4_t, 4> src{x};
         dst.template AsType<half2_t>()(Number<0>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<0>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<0>{}]);
         dst.template AsType<half2_t>()(Number<1>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<1>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<1>{}]);
         dst.template AsType<half2_t>()(Number<2>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<2>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<2>{}]);
         dst.template AsType<half2_t>()(Number<3>{}) =
-            pki4_to_half2(src.template AsType<pk_i4_t>()[Number<3>{}]);
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<3>{}]);
         y = dst.template AsType<half8_t>()[Number<0>{}];
 #endif
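`DequantPack8` fuses the int4 unpack with the per-pair scale `z`. Under the zero-point convention used throughout this file, each output element is `(v - 8) * scale`; the exact fused fp16 sequence inside `i4_to_half4_scale` is not shown in the hunk, so this host-only sketch only illustrates the per-element arithmetic:

```cpp
// Per-element math of the dequant path (illustrative values, names local
// to this sketch): unpack a 4-bit field, remove the zero-point, scale.
#include <cstdio>

int main()
{
    const unsigned nibble = 0xB;  // one packed 4-bit field (11)
    const float scale     = 0.5f; // one lane of the half2_t scale z
    const float y = (static_cast<int>(nibble) - 8) * scale; // (11 - 8) * 0.5
    std::printf("%.2f\n", y);     // 1.50
    return 0;
}
```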
@@ -260,7 +219,7 @@ struct PassThroughPack2
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
     {
-#if 1
+#if CK_USE_PK4_LAYOUT_SHUFFLE
         uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
         uint8_t x_l = (x_u8 & 0x0f) >> 0;
         uint8_t x_h = (x_u8 & 0xf0) >> 4;
...