clean

40d038e9 · Jing Zhang · c3d05c0c · 40d038e9 · 40d038e9 · 40d038e9
Commit 40d038e9 authored Oct 14, 2024 by Jing Zhang
7 changed files
--- a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
@@ -27,7 +27,7 @@ using DeviceGemmV2Instance =
        ALayout,   BLayout,  CLayout,   
        ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
        AElementOp, BElementOp, CElementOp, GemmDefault, 
-#if 0
+#if 1
        64,
        16, 16, 
        256, 8, 16,

--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -256,7 +256,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
      //                             get_rtol<CDataType>(),
      //                             get_atol<CDataType>());

-      LogRangeAsType<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl;
+      //for(int i = 0; i < M; i++)
+      //{
+      //    for(int j = 0; j < N; j++)
+      //    {
+      //        std::cout << ck::type_convert<float>(c_m_n_device_result(i, j)) << ",";
+      //    }
+      //    std::cout << std::endl;
+      //}
 #endif
    }


--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -25,7 +25,7 @@ struct PassThroughPack2

    __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
 	{
-#if 1
+#if 0
 		uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
 		uint8_t x_l  = (x_u8 & 0x0f) >> 0;
 		uint8_t x_h  = (x_u8 & 0xf0) >> 4;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -151,6 +151,20 @@ struct GridwiseGemm_xdl_cshuffle_v3

    using ThisThreadBlock = ThisThreadBlock<BlockSize>;

+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
    __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
    {
        return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
@@ -625,9 +639,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
        // in some cases.
        else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
        {
-            constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
-                                                  ? 1
-                                                  : 32 * 4 / KPerBlock / sizeof(ADataType);
+            constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType);
+            constexpr auto MLdsLayer =  LdsSize < 1 ? 1 : LdsSize;
            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
                make_tuple(
                    AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
@@ -761,10 +774,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
        else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
        {
            // NLdsLayer * K0 as logical Bank
-            constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
-                                           ? 1
-                                           : 32 * 4 / KPerBlock / sizeof(BDataType);
-            ;
+            constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType);
+            constexpr auto NLdsLayer =   LdsSize < 1 ? 1 : LdsSize;
            constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
                make_tuple(
                    BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
@@ -924,20 +935,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                NXdlPerWave,
                                KPack>())>;

-    static constexpr index_t APackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
-            return 2;
-        else
-            return 1;
-    }();
-
-    static constexpr index_t BPackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
-            return 2;
-        else
-            return 1;
-    }();
-
    __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
        // LDS allocation for A and B: be careful of alignment
@@ -1326,8 +1323,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            bit_cast<BDataType*>(bit_cast<unsigned char *>(p_shared) +
-                a_block_space_size_aligned * sizeof(ADataType)),
+            reinterpret_cast<BDataType*>(static_cast<ADataType*>(p_shared) +
+                a_block_space_size_aligned),
            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1211,10 +1211,6 @@ struct ThreadwiseTensorSliceTransfer_v4
                        dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                    dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
-
-
-                    if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
-                        printf("v4: %f %d\n", type_convert<float>(dst_buf[Number<dst_offset>{}]), threadIdx.x);
                });
            }
        });

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -554,9 +554,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);

                dst_vector_container.template AsType<DstData>()(i) = dst_v;
-
-                //if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
-                    //printf("v3r1: %f %d\n", type_convert<float>(dst_v), threadIdx.x);
            });

            // copy data from dst_vector_container to dst_buf

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -157,16 +157,6 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
    template <class FloatC>
    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
    {
-        auto tmp_a = vector_type<half_t, 4>{reg_a};
-        auto tmp_b = vector_type<half_t, 4>{reg_b};
-        printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n", 
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<0>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<0>{})),
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<1>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<1>{})),
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<2>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<2>{})),
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<3>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<3>{})),
-        threadIdx.x, blockIdx.x
-        );
-
        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
    }