debug

c3d05c0c · Jing Zhang · 3ef4d2c2 · c3d05c0c · c3d05c0c · c3d05c0c
Commit c3d05c0c authored Oct 13, 2024 by Jing Zhang
7 changed files
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -8,38 +8,13 @@
 using ADataType        = ck::half_t;
 using BDataType        = ck::pk_i4_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
 using CDataType        = ck::half_t;

 using ALayout = Row;
 using BLayout = Col;
 using CLayout = Row;

-inline __host__ __device__ ck::half2_t
-type_convert_packed_i4_to_half2(ck::pk_i4_t x)
-{
-    uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
-    uint8_t x_l  = (x_u8 & 0x0f);
-    uint8_t x_h  = (x_u8 & 0xf0) >> 4;
-
-    auto l_f16 = ck::type_convert<ck::half_t>(x_l);
-    auto h_f16 = ck::type_convert<ck::half_t>(x_h);
-
-    return {l_f16, h_f16};
-}
-
-
-struct ElementwisePackedI4ToHalf2
-{
-	__host__ __device__ void
-	operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
-    {
-        y = type_convert_packed_i4_to_half2(x);
-    }
-
-	constexpr const static bool is_pack2_invocable = true;
-};
-
 using AElementOp = PassThrough;
 using BElementOp = PassThrough;
 using CElementOp = PassThrough;

--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -133,7 +133,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
        };

    StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
-    StrideB = f_get_default_stride(K, N, StrideB / 2, BLayout{});
+    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
    StrideC = f_get_default_stride(M, N, StrideC, CLayout{});

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
@@ -143,7 +143,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    {
    case 0:
        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{0x11});
        break;
    case 1:
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
@@ -228,16 +228,15 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    }

    bool pass = true;
-#if 0
    if(config.do_verification)
    {
-        auto ref_gemm    = ReferenceGemmInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
+      //auto ref_gemm    = ReferenceGemmInstance{};
+      //auto ref_invoker = ref_gemm.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
+      //auto ref_argument = ref_gemm.MakeArgument(
+      //    a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});

-        ref_invoker.Run(ref_argument);
+      //ref_invoker.Run(ref_argument);

        ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});
 #ifdef BUILD_INT4_EXAMPLE
@@ -251,14 +250,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

-        pass &= ck::utils::check_err(c_m_n_device_result,
-                                     c_m_n_host_result,
-                                     "Error: Incorrect results!",
-                                     get_rtol<CDataType>(),
-                                     get_atol<CDataType>());
+      //pass &= ck::utils::check_err(c_m_n_device_result,
+      //                             c_m_n_host_result,
+      //                             "Error: Incorrect results!",
+      //                             get_rtol<CDataType>(),
+      //                             get_atol<CDataType>());
+
+      LogRangeAsType<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl;
 #endif
    }
-#endif
+

    if(config.time_kernel)
    {
@@ -267,7 +268,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)

        std::size_t flop = 2_uz * M * N * K;
        std::size_t num_btype =
-            sizeof(ADataType) * M * K + sizeof(BDataType) * K * N / 2 + sizeof(CDataType) * M * N;
+            sizeof(ADataType) * M * K + sizeof(BDataType) * K * N / (ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) + sizeof(CDataType) * M * N;

        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;


--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -25,7 +25,7 @@ struct PassThroughPack2

    __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
 	{
-#if 0
+#if 1
 		uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
 		uint8_t x_l  = (x_u8 & 0x0f) >> 0;
 		uint8_t x_h  = (x_u8 & 0xf0) >> 4;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -924,6 +924,13 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                NXdlPerWave,
                                KPack>())>;

+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
    static constexpr index_t BPackedSize = []() {
        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
            return 2;
@@ -941,10 +948,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);

        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize, max_lds_align);

        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align) / BPackedSize;
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize, max_lds_align);

        // LDS allocation for C shuffle in LDS
        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
@@ -1312,14 +1319,14 @@ struct GridwiseGemm_xdl_cshuffle_v3

        // LDS allocation for A and B: be careful of alignment
        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize, max_lds_align);

        // Cast after lds
        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(static_cast<unsigned char *>(p_shared) +
+            bit_cast<BDataType*>(bit_cast<unsigned char *>(p_shared) +
                a_block_space_size_aligned * sizeof(ADataType)),
            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

@@ -1707,10 +1714,10 @@ struct GridwiseGemm_xdl_cshuffle_v3

        // LDS allocation for A and B: be careful of alignment
        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize, max_lds_align);

        auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

        auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<BDataType*>(static_cast<char*>(p_shared_0) +
@@ -1718,10 +1725,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

        auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

        auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(static_cast<char*>(p_shared_1) +
+            bit_cast<BDataType*>(bit_cast<char*>(p_shared_1) +
                a_block_space_size_aligned * sizeof(ADataType)),
            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);


--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1149,9 +1149,11 @@ struct ThreadwiseTensorSliceTransfer_v4
                // DstData)
                vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;

-                using dst_v_t = typename vector_type_maker_t<DstData, PackedSize>::type;
+                constexpr index_t pack_size = PackedSize;
+
+                using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
                using src_v_t = typename vector_type_maker_t<SrcData, 1>::type;
-                static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) {
+                static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
                    ck::tensor_operation::element_wise::PassThroughPack2{}(
                        dst_tmp_vector.template AsType<dst_v_t>()(i),
                        src_tmp_vector.template AsType<src_v_t>()[i]);
@@ -1209,6 +1211,10 @@ struct ThreadwiseTensorSliceTransfer_v4
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
+
+
+                    if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
+                        printf("v4: %f %d\n", type_convert<float>(dst_buf[Number<dst_offset>{}]), threadIdx.x);
                });
            }
        });

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -193,9 +193,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            using src_vector_type = vector_type_maker_t<SrcData, SrcScalarPerVector>;
            using src_vector_t    = typename src_vector_type::type;

-            auto src_vector_container =
-                src_vector_type{src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize, true)};
-
            using dst_vector_type = vector_type_maker_t<DstData, SrcScalarPerVector>;
            using dst_vector_t    = typename dst_vector_type::type;
            dst_vector_type op_r_v;
@@ -229,6 +226,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1

            static_assert(elem_op_vec_len == 1, "elem_op_vec_len != 1");

+            auto src_vector_container =
+                src_vector_type{src_buf.template Get<src_vector_t>(src_coord_.GetOffset() / PackedSize, true)};
+
            static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto idx) {
                // apply the src elementwise op and convert to DstData under the hood if needed
                src_element_op_(op_r_v.template AsType<dst_elem_op_vec_t>()(idx),
@@ -554,6 +554,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);

                dst_vector_container.template AsType<DstData>()(i) = dst_v;
+
+                //if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
+                    //printf("v3r1: %f %d\n", type_convert<float>(dst_v), threadIdx.x);
            });

            // copy data from dst_vector_container to dst_buf

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -157,11 +157,18 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
    template <class FloatC>
    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
    {
-        ignore = reg_a;
-        ignore = reg_b;
-        ignore = reg_c;
-        //reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
-            //reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
+        auto tmp_a = vector_type<half_t, 4>{reg_a};
+        auto tmp_b = vector_type<half_t, 4>{reg_b};
+        printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n", 
+        static_cast<float>(tmp_a.template AsType<half_t>()(Number<0>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<0>{})),
+        static_cast<float>(tmp_a.template AsType<half_t>()(Number<1>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<1>{})),
+        static_cast<float>(tmp_a.template AsType<half_t>()(Number<2>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<2>{})),
+        static_cast<float>(tmp_a.template AsType<half_t>()(Number<3>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<3>{})),
+        threadIdx.x, blockIdx.x
+        );
+
+        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
+            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
    }
 };