Merge branch 'develop' into fa-h512

8b49f207 · Max Podkorytov · GitHub · 0d59f474 · a6b761c3 · 8b49f207
Unverified Commit 8b49f207 authored Jan 07, 2025 by Max Podkorytov Committed by GitHub Jan 07, 2025
20 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4_b_scale.hpp
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -5,6 +5,8 @@
 #include <string>
 #include <sstream>
+#include <regex>
+#include <optional>
 #include "ck/stream_config.hpp"
@@ -12,6 +14,34 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
+#define GET_OBJECT_NAME_IMLP                                                  \
+    std::optional<std::string> GetObjectName() const override                 \
+    {                                                                         \
+        std::string str = __PRETTY_FUNCTION__;                                \
+        static std::regex obj_name_expr{"<std::string> (.*)::GetObjectName"}; \
+        std::smatch match;                                                    \
+        if(!std::regex_search(str, match, obj_name_expr))                     \
+        {                                                                     \
+            return str;                                                       \
+        }                                                                     \
+        return std::string(match[1]) + ';';                                   \
+    }
+#define GET_TEMPLATE_INFO_IMPL                                  \
+    std::optional<std::string> GetTemplateInfo() const override \
+    {                                                           \
+        std::string str = __PRETTY_FUNCTION__;                  \
+        static std::regex template_expr{"\\[(.*)\\]"};          \
+        std::smatch match;                                      \
+        if(!std::regex_search(str, match, template_expr))       \
+        {                                                       \
+            return std::nullopt;                                \
+        }                                                       \
+        return std::string(match[1]);                           \
+    }
+#define REGISTER_EXTRA_PRINTING_METHODS GET_OBJECT_NAME_IMLP GET_TEMPLATE_INFO_IMPL
 struct BaseArgument
 {
    BaseArgument()                    = default;
@@ -48,6 +78,10 @@ struct BaseOperator
    virtual std::string GetTypeIdName() const { return typeid(*this).name(); }
+    virtual std::optional<std::string> GetObjectName() const { return std::nullopt; }
+    virtual std::optional<std::string> GetTemplateInfo() const { return std::nullopt; }
    virtual std::string GetTypeIdHashCode() const
    {
        std::ostringstream oss;

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
@@ -89,7 +89,8 @@ struct DeviceBatchedGemmV2MultiD : public BaseOperator
                        index_t BatchStrideE,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
-                        CDEElementwiseOperation cde_element_op) = 0;
+                        CDEElementwiseOperation cde_element_op,
+                        index_t KBatch) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

--- a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp
@@ -36,6 +36,10 @@ struct DeviceGemmV2 : public BaseOperator
                        CElementwiseOperation c_element_op) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+    virtual bool GetPermuteA()         = 0;
+    virtual bool GetPermuteB()         = 0;
+    virtual ck::index_t GetKPerBlock() = 0;
 };
 template <typename ALayout,
@@ -73,6 +77,43 @@ struct DeviceGemmV2R1 : public BaseOperator
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename BScaleType,
+          typename CDataType,
+          index_t ScaleBlockN,
+          index_t ScaleBlockK,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmV2BScale : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        ck::index_t StrideC,
+                        ck::index_t StrideScaleB,
+                        const void* p_b_scale,
+                        ck::index_t KSplit,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+    virtual bool GetPermuteB()         = 0;
+    virtual ck::index_t GetKPerBlock() = 0;
+};
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -41,12 +41,15 @@ __global__ void
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t g_idx = blockIdx.z % karg.Batch;
+    const index_t k_idx = blockIdx.z / karg.Batch;
    const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
    const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
    const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
    const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
    // populate pointer, desc for Ds
    static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
        // D pointer
@@ -54,8 +57,8 @@ __global__ void
    });
    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset,
+        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + b_batch_offset,
+        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
        karg.p_ds_grid,
        karg.p_c_grid + c_batch_offset,
        p_shared,
@@ -87,12 +90,15 @@ __global__ void
    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t g_idx = blockIdx.z % karg.Batch;
+    const index_t k_idx = blockIdx.z / karg.Batch;
    const auto a_batch_offset  = karg.compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
    const auto b_batch_offset  = karg.compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
    const auto ds_batch_offset = karg.compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
    const auto c_batch_offset  = karg.compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, k_idx);
    // populate pointer, desc for Ds
    static_for<0, GridwiseGemm::NumDTensor, 1>{}([&](auto i) {
        // D pointer
@@ -100,8 +106,8 @@ __global__ void
    });
    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_a_grid + a_batch_offset,
+        karg.p_a_grid + a_batch_offset + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + b_batch_offset,
+        karg.p_b_grid + b_batch_offset + splitk_batch_offset.b_k_split_offset,
        karg.p_ds_grid,
        karg.p_c_grid + c_batch_offset,
        p_shared_0,
@@ -303,7 +309,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                 index_t Batch_,
                 AElementwiseOperation a_element_op_,
                 BElementwiseOperation b_element_op_,
-                 CElementwiseOperation c_element_op_)
+                 CElementwiseOperation c_element_op_,
+                 index_t KBatch_)
            : GridwiseGemm::Argument{p_a_grid_,
                                     p_b_grid_,
                                     p_ds_grid_,
@@ -315,7 +322,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                                     StrideB_,
                                     StrideDs_,
                                     StrideE_,
-                                     1,
+                                     KBatch_,
                                     a_element_op_,
                                     b_element_op_,
                                     c_element_op_},
@@ -336,13 +343,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                arg.Print();
            }
-            if(!GridwiseGemm::CheckValidity(arg) || arg.KBatch > 1)
+            if(!GridwiseGemm::CheckValidity(arg))
            {
                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
            }
            index_t gdx, gdy, gdz;
-            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch);
+            std::tie(gdx, gdy, gdz) =
+                GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.Batch * arg.KBatch);
            float ave_time = 0;
@@ -387,10 +395,11 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                        rotating_mem.Next();
                        // clear c mem
                        if(arg_.KBatch > 1)
-                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                            hipGetErrorString(
-                                                             0,
+                                hipMemsetAsync(arg_.p_c_grid,
-                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                               0,
-                                                             stream_config.stream_id_));
+                                               arg.Batch * arg_.M * arg_.N * sizeof(CDataType),
+                                               stream_config.stream_id_));
                    };
                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
@@ -889,7 +898,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                             index_t BatchStrideE,
                             AElementwiseOperation a_element_op,
                             BElementwiseOperation b_element_op,
-                             CElementwiseOperation c_element_op)
+                             CElementwiseOperation c_element_op,
+                             index_t KBatch = 1)
    {
        return Argument{static_cast<const ADataType*>(p_a),
                        static_cast<const BDataType*>(p_b),
@@ -909,7 +919,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                        Batch,
                        a_element_op,
                        b_element_op,
-                        c_element_op};
+                        c_element_op,
+                        KBatch};
    }
    static auto MakeInvoker() { return Invoker{}; }
@@ -934,7 +945,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                        index_t BatchStrideE,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
-                        CElementwiseOperation c_element_op) override
+                        CElementwiseOperation c_element_op,
+                        index_t KBatch = 1) override
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
@@ -954,7 +966,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                                          Batch,
                                          a_element_op,
                                          b_element_op,
-                                          c_element_op);
+                                          c_element_op,
+                                          KBatch);
    }
    // polymorphic

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -469,7 +469,11 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
        {
            return false;
        }
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> &&
+           arg.Streamk_sel > 0)
+        {
+            return false;
+        }
        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
                                                       GemmSpec == GemmSpecialization::NKPadding ||
                                                       GemmSpec == GemmSpecialization::MNKPadding ||

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -64,7 +64,9 @@ template <typename ALayout,
          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
          typename ComputeTypeA                       = CDataType,
-          typename ComputeTypeB                       = ComputeTypeA>
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
 struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                                                       BLayout,
                                                       CLayout,
@@ -122,7 +124,9 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
        BlkGemmPipeSched,
        BlkGemmPipelineVer,
        ComputeTypeA,
-        ComputeTypeB>;
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
    using Argument = typename GridwiseGemm::Argument;
@@ -633,6 +637,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }
+    index_t GetKPerBlock() override { return KPerBlock; }
+    bool GetPermuteA() override { return PermuteA; }
+    bool GetPermuteB() override { return PermuteB; }
    static auto MakeArgument(const ADataType* p_a,
                             const BDataType* p_b,
                             CDataType* p_c,
@@ -729,6 +738,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
        return str.str();
    }
+    REGISTER_EXTRA_PRINTING_METHODS
 };
 } // namespace device

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
@@ -41,7 +41,7 @@ __global__ void
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
@@ -76,7 +76,7 @@ __global__ void
    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
@@ -639,27 +639,27 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
    struct SplitKBatchOffset
    {
-        __device__ SplitKBatchOffset(Argument& karg)
+        __device__ SplitKBatchOffset(Argument& karg, index_t k_id)
        {
            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
            {
-                a_k_split_offset = blockIdx.z * karg.KRead;
+                a_k_split_offset = k_id * karg.KRead;
            }
            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
            {
-                a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA;
+                a_k_split_offset = k_id * karg.KRead * karg.StrideA;
            }
            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
            {
-                b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB;
+                b_k_split_offset = k_id * karg.KRead * karg.StrideB;
            }
            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
            {
-                b_k_split_offset = blockIdx.z * karg.KRead;
+                b_k_split_offset = k_id * karg.KRead;
            }
-            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
+            if(k_id < karg.KBatch - 1)
            {
                karg.K = karg.KRead;
            }

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -307,7 +307,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
    // Wave mode dependent propety
    static constexpr index_t wave_size = Number<WaveSize>{};
-    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
+    // * Fixed for gfx11, Will be wave mode dependent on gfx12
    // static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
    // static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
    // * num_acc_vgprs_per_wave alone M direction

--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp