Commit 3ef4d2c2 authored by Jing Zhang

clean

parent 0f3b88bf
@@ -489,7 +489,7 @@ include_directories(BEFORE
 SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
 if(BUILD_DEV)
-    add_compile_options(-Werror)
+    #add_compile_options(-Werror)
     add_compile_options(-Weverything)
 endif()
 message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
...
@@ -66,7 +66,7 @@ else()
         -Wunreachable-code
         -Wunused
         -Wno-reserved-identifier
-        -Werror
+        #-Werror
         -Wno-option-ignored
         -Wsign-compare
         -Wno-extra-semi-stmt
...
@@ -8,7 +8,7 @@
 using ADataType        = ck::half_t;
 using BDataType        = ck::f8_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
 using CDataType        = ck::half_t;
 using ALayout          = Row;
...
@@ -133,7 +133,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     };

     StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
-    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideB = f_get_default_stride(K, N, StrideB / 2, BLayout{});
     StrideC = f_get_default_stride(M, N, StrideC, CLayout{});

     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
@@ -267,7 +267,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     std::size_t flop = 2_uz * M * N * K;
     std::size_t num_btype =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N / 2 + sizeof(CDataType) * M * N;

     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
...
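The two / 2 factors above account for the 4-bit packing of B: two B values share one stored byte, so both the default B stride and the traffic estimate are halved. As a rough sanity check of the reported numbers (hypothetical problem size, and assuming ave_time is in milliseconds as in the other CK examples), the arithmetic works out like this:

// Illustrative only: hypothetical M = N = K = 4096 with 2-byte A/C and packed-int4 B.
#include <cstdio>

int main()
{
    const double M = 4096, N = 4096, K = 4096;
    const double flop      = 2.0 * M * N * K;                          // ~1.37e11 FLOP
    const double num_btype = 2.0 * M * K + K * N / 2.0 + 2.0 * M * N;  // A + packed B + C, in bytes
    const double ave_time  = 1.0;                                      // ms, made up for the example
    std::printf("%.2f TFLOPS, %.2f GB/s\n",
                flop / 1.e9 / ave_time,        // FLOP per ms / 1e9 -> TFLOPS
                num_btype / 1.e6 / ave_time);  // bytes per ms / 1e6 -> GB/s
}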
@@ -25,6 +25,7 @@ struct PassThroughPack2
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
     {
+#if 0
         uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
         uint8_t x_l  = (x_u8 & 0x0f) >> 0;
         uint8_t x_h  = (x_u8 & 0xf0) >> 4;
@@ -33,6 +34,10 @@ struct PassThroughPack2
         auto h_f16 = ck::type_convert<ck::half_t>(x_h);

         y = {l_f16, h_f16};
+#else
+        uint32_t t = ck::bit_cast<uint8_t>(x);
+        y          = ck::bit_cast<half2_t>(t);
+#endif
     }
     constexpr const static bool is_pack2_invocable = true;
...
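In the hunk above, the original pk_i4_t decode (split the byte into two nibbles and convert each to half) is parked behind #if 0, and the #else branch substitutes a raw bit_cast of the byte into a half2_t, which reads like a temporary debugging stand-in rather than a numerically meaningful conversion. For reference, a minimal host-side sketch of what the disabled branch computes, assuming the low element sits in bits [3:0] and the high element in bits [7:4] as the masks indicate:

#include <cstdint>
#include <utility>

// Host-side stand-in for the device code: unpack one packed-int4 byte into two
// values (plain float here instead of ck::half_t). Names are illustrative only.
std::pair<float, float> unpack_pk_i4(uint8_t x_u8)
{
    const uint8_t x_l = (x_u8 & 0x0f) >> 0; // low nibble  -> element 0
    const uint8_t x_h = (x_u8 & 0xf0) >> 4; // high nibble -> element 1
    return {static_cast<float>(x_l), static_cast<float>(x_h)};
}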
@@ -924,6 +924,13 @@ struct GridwiseGemm_xdl_cshuffle_v3
         NXdlPerWave,
         KPack>())>;

+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         // LDS allocation for A and B: be careful of alignment
@@ -937,7 +944,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

         constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align) / BPackedSize;

         // LDS allocation for C shuffle in LDS
         constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
@@ -1312,9 +1319,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+            static_cast<BDataType*>(static_cast<unsigned char *>(p_shared) +
+                                    a_block_space_size_aligned * sizeof(ADataType)),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
@@ -1329,19 +1336,19 @@ struct GridwiseGemm_xdl_cshuffle_v3
             KPerBlock);

         blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
                                                                          a_block_desc_ak0_m_ak1,
                                                                          a_blockwise_copy,
                                                                          a_grid_buf,
                                                                          a_block_buf,
                                                                          a_block_slice_copy_step,
                                                                          b_grid_desc_bk0_n_bk1,
                                                                          b_block_desc_bk0_n_bk1,
                                                                          b_blockwise_copy,
                                                                          b_grid_buf,
                                                                          b_block_buf,
                                                                          b_block_slice_copy_step,
                                                                          c_thread_buf,
                                                                          num_k_block_main_loop);

         // shuffle C and write out
         {
@@ -1706,17 +1713,17 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared_0) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+            static_cast<BDataType*>(static_cast<char*>(p_shared_0) +
+                                    a_block_space_size_aligned * sizeof(ADataType)),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared_1) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+            static_cast<BDataType*>(static_cast<char*>(p_shared_1) +
+                                    a_block_space_size_aligned * sizeof(ADataType)),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
         auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong);
...
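Two related adjustments run through this file: element counts for the B tile in LDS are divided by BPackedSize (one pk_i4_t slot stores two logical B values), and the B base pointer is now computed in bytes, by advancing a byte pointer past A's footprint before casting to BDataType*, instead of the old sizeof(ADataType) / sizeof(BDataType) element-ratio arithmetic. A minimal sketch of the carve-out, with hypothetical tile sizes and stand-in types:

#include <cstdint>

// Illustrative only: split one shared-memory block into an A region followed by a
// packed-int4 B region, mirroring the pointer arithmetic introduced above.
using half_stand_in = uint16_t;       // 2-byte stand-in for ck::half_t (only the size matters)
struct pk_i4_stand_in { uint8_t v; }; // one stored byte holds two 4-bit B values

constexpr int BPackedSize = 2;        // pk_i4_t: 2 logical elements per stored byte
constexpr int a_elems     = 128 * 32; // hypothetical A tile, in elements
constexpr int b_elems     = 128 * 32; // hypothetical B tile, in logical elements

inline pk_i4_stand_in* b_base(void* p_shared)
{
    // advance by A's byte footprint, then reinterpret the remainder as packed B
    return reinterpret_cast<pk_i4_stand_in*>(static_cast<unsigned char*>(p_shared) +
                                             a_elems * sizeof(half_stand_in));
}

constexpr int b_stored_elems = b_elems / BPackedSize; // B buffer length in pk_i4 units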
@@ -79,7 +79,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
           dst_element_op_(dst_element_op)
     {
         static_assert(is_same_v<remove_cvref_t<SrcData>, remove_cvref_t<DstData>>, "SrcData != DstData");
-        static_assert(!(is_same_v<remove_cvref_t<SrcData>, pk_i4_t> && (SrcScalarPerVector == 1 || DstScalarPerVector == 1)), "pk data N cannot be 1");
+        static_assert(!(is_same_v<remove_cvref_t<SrcData>, pk_i4_t> && (SrcScalarPerVector_ == 1 || DstScalarPerVector_ == 1)), "pk data N cannot be 1");
     }

     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -108,11 +108,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // scalar per access on each dim
         // TODO: don't use lambda_scalar_per_access
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector * PackedSize>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});

         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;

-        static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector * PackedSize) == 0,
+        static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector_) == 0,
                       "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector");

         constexpr auto src_dim_access_order = SrcDimAccessOrder{};
@@ -206,17 +206,20 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 if constexpr(decltype(src_element_op_)::is_pack8_invocable)
                     return math::min(8, SrcScalarPerVector);
             }
-            if constexpr(is_detected<is_pack4_invocable_t, decltype(src_element_op_)>::value)
+            else if constexpr(is_detected<is_pack4_invocable_t, decltype(src_element_op_)>::value)
             {
                 if constexpr(decltype(src_element_op_)::is_pack4_invocable)
                     return math::min(4, SrcScalarPerVector);
             }
-            if constexpr(is_detected<is_pack2_invocable_t, decltype(src_element_op_)>::value)
+            else if constexpr(is_detected<is_pack2_invocable_t, decltype(src_element_op_)>::value)
             {
                 if constexpr(decltype(src_element_op_)::is_pack2_invocable)
                     return math::min(2, SrcScalarPerVector);
             }
-            return 1;
+            else
+            {
+                return 1;
+            }
         };

         constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
@@ -224,6 +227,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         using src_elem_op_vec_t = typename vector_type<SrcData, elem_op_vec_len>::type;
         using dst_elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;

+        static_assert(elem_op_vec_len == 1, "elem_op_vec_len != 1");
+
         static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto idx) {
             // apply the src elementwise op and convert to DstData under the hood if needed
             src_element_op_(op_r_v.template AsType<dst_elem_op_vec_t>()(idx),
@@ -285,6 +290,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
     TransferDataFromSrcThreadScratchToDstThreadScratch(Number<ThreadScratchId> thread_scratch_id)
     {
 #if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
+        static_assert(false, "");
         static_ford<SliceLengths>{}([&](auto idx) {
             dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
         });
@@ -292,7 +298,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // OOB Check
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector * PackedSize>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});

         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
@@ -353,6 +359,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 .template SetAsType<vector_t>(src_data_idx_seq, op_r_v);
         });

+        static_assert(!(is_same_v<pk_i4_t, remove_cvref_t<SrcData>> && SrcVectorDim != DstVectorDim), "pk_i4_t does not support transpose");
+
         // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
         // TODO make this logic more generic for more sub-dword datatype
         if constexpr(SrcVectorDim != DstVectorDim &&
@@ -381,9 +389,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
-                                                             SrcScalarPerVector * PackedSize,
+                                                             SrcScalarPerVector_,
                                                              DstVectorDim,
-                                                             DstScalarPerVector * PackedSize>{},
+                                                             DstScalarPerVector_>{},
             Number<nDim>{});

         constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
@@ -456,7 +464,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // src scalar per access on each dim
         // TODO: don't use this
         constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector * PackedSize>{}, Number<nDim>{});
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});

         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
...
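The transfer changes above replace SrcScalarPerVector * PackedSize with the raw SrcScalarPerVector_ template argument in every scalar-per-access computation, and the new static_asserts spell out the packed-int4 restrictions: the vector length cannot be 1 and pk_i4_t data cannot take the sub-dword transpose path, while the static_assert(false, "") and elem_op_vec_len == 1 checks look like temporary debugging traps to confirm which code path gets instantiated. The divisibility rule behind the existing assert is easy to state in isolation; a stand-alone illustration, not CK code:

// The slice length along the vector dimension must be a multiple of the per-access width.
template <int SliceLenOnVectorDim, int ScalarPerVector>
constexpr bool slice_is_divisible = (SliceLenOnVectorDim % ScalarPerVector) == 0;

static_assert(slice_is_divisible<64, 8>, "64 elements split cleanly into 8-wide accesses");
static_assert(!slice_is_divisible<64, 6>, "6 does not divide 64, so this width is rejected");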
@@ -157,8 +157,11 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
     template <class FloatC>
     __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
     {
-        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
-            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+        //reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
+        //reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
     }
 };
...
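This hunk stubs out the 16x16x16 f16 MFMA specialization: the builtin call is commented away and the ignore assignments keep unused-parameter warnings quiet, which fits the debugging character of the rest of the commit. For orientation, the intrinsic's arithmetic is a 16x16x16 matrix multiply-accumulate per wavefront; a plain scalar reference of that math (not the register layout the builtin actually uses):

#include <array>

using Mat16 = std::array<std::array<float, 16>, 16>;

// D = A(16x16) * B(16x16) + C(16x16); inputs are f16 in hardware, float here.
Mat16 mfma_16x16x16_reference(const Mat16& a, const Mat16& b, Mat16 c)
{
    for(int m = 0; m < 16; ++m)
        for(int n = 0; n < 16; ++n)
            for(int k = 0; k < 16; ++k)
                c[m][n] += a[m][k] * b[k][n];
    return c;
}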
@@ -17,7 +17,7 @@ fi
 cmake \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
--D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
+-D CMAKE_HIP_FLAGS="-save-temps -gline-tables-only -Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
 -D CMAKE_BUILD_TYPE=Release \
 -D BUILD_DEV=ON \
 -D GPU_TARGETS=$GPU_TARGETS \
...