Merge branch 'develop' into gridwise_2d

dc70e3e1 · arai713 · GitHub · 10947a54 · 8ee36118 · dc70e3e1
Unverified Commit dc70e3e1 authored Nov 01, 2022 by arai713 Committed by GitHub Nov 01, 2022
20 changed files
--- a/example/34_batchnorm/batchnorm_forward_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_nhwc.cpp
--- a/example/34_batchnorm/batchnorm_infer_impl.hpp
+++ b/example/34_batchnorm/batchnorm_infer_impl.hpp
@@ -14,8 +14,12 @@
 #include "batchnorm_common.hpp"
-template <typename InOutDataType,
+template <typename XDataType,
+          typename YDataType,
          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
          ck::index_t Rank,
          ck::index_t NumBatchNormReduceDim,
          bool fastest_dim_is_reduced = false>
@@ -26,7 +30,9 @@ int bnorm_infer(
    const std::array<ck::index_t, Rank> xStrides,
    const std::array<ck::index_t, Rank> yStrides,
    const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
-    const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
+    const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
+    const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
+    const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
    const void* p_x,
    const void* p_scale,
    const void* p_bias,
@@ -41,11 +47,11 @@ int bnorm_infer(
                  "Invalid number of reduced dimensions for batchnorm!");
    using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
-        ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
+        ck::Tuple<XDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
-                                                                                      // variance,
+                                                                                  // variance,
-                                                                                      // scale,
+                                                                                  // scale,
-                                                                                      // bias,
+                                                                                  // bias,
-        ck::Tuple<InOutDataType>,                                                     // y
+        ck::Tuple<YDataType>,                                                     // y
        NormalizeInInfer,
        Rank,
        2,                           // MPerthread
@@ -53,14 +59,18 @@ int bnorm_infer(
        ck::Sequence<1>>;            // scalarPerVector: y
    auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
-    std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
+    std::array<ck::index_t, Rank> aligned_bnScaleStrides{0};
+    std::array<ck::index_t, Rank> aligned_bnBiasStrides{0};
+    std::array<ck::index_t, Rank> aligned_bnMeanVarStrides{0};
    int i = 0;
    for(auto dim : invariantDims)
    {
        assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
-        aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
+        aligned_bnScaleStrides[dim]   = bnScaleStrides[i];
+        aligned_bnBiasStrides[dim]    = bnBiasStrides[i];
+        aligned_bnMeanVarStrides[dim] = bnMeanVarStrides[i];
        i++;
    };
@@ -84,10 +94,10 @@ int bnorm_infer(
    auto argument_ptr1 = dev_normalize.MakeArgumentPointer(
        xyLengths,
        {xStrides,
-         aligned_scaleBiasMeanVarStrides,
+         aligned_bnMeanVarStrides,
-         aligned_scaleBiasMeanVarStrides,
+         aligned_bnMeanVarStrides,
-         aligned_scaleBiasMeanVarStrides,
+         aligned_bnScaleStrides,
-         aligned_scaleBiasMeanVarStrides},
+         aligned_bnBiasStrides},
        {yStrides},
        {p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias},
        {p_y},
@@ -105,8 +115,10 @@ int bnorm_infer(
    avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
-    num_bytes += (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
+    num_bytes += total_length * sizeof(XDataType) +
-                  total_length * sizeof(InOutDataType));
+                 invariantLength *
+                     (sizeof(ScaleDataType) + sizeof(BiasDataType) + 2 * sizeof(MeanVarDataType)) +
+                 total_length * sizeof(YDataType);
    if(time_kernel)
    {

--- a/example/34_batchnorm/batchnorm_infer_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_infer_nhwc.cpp
@@ -18,11 +18,6 @@
 #include "batchnorm_infer_impl.hpp"
-template <typename InOutDataType, typename AccDataType>
-using ReferenceBatchNormInferInstance =
-    ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C<InOutDataType,
-                                                                               AccDataType>;
 static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"help", no_argument, nullptr, '?'},
@@ -236,21 +231,30 @@ bool bnorm_infer_nhwc_test(bool do_verification,
    int result = 0;
-    result = bnorm_infer<InOutDataType, AccDataType, Rank, NumReduceDim, false>(
+    result = bnorm_infer<InOutDataType,
-        time_kernel,
+                         InOutDataType,
-        {0, 1, 2},
+                         AccDataType,
-        i_inOutLengths,
+                         AccDataType,
-        i_inOutStrides,
+                         AccDataType,
-        i_inOutStrides,
+                         AccDataType,
-        i_scaleBiasMeanVarLengths,
+                         Rank,
-        i_scaleBiasMeanVarStrides,
+                         NumReduceDim,
-        x_dev.GetDeviceBuffer(),
+                         false>(time_kernel,
-        bnScale_dev.GetDeviceBuffer(),
+                                {0, 1, 2},
-        bnBias_dev.GetDeviceBuffer(),
+                                i_inOutLengths,
-        epsilon,
+                                i_inOutStrides,
-        estimatedMean_dev.GetDeviceBuffer(),
+                                i_inOutStrides,
-        estimatedVariance_dev.GetDeviceBuffer(),
+                                i_scaleBiasMeanVarLengths,
-        y_dev.GetDeviceBuffer());
+                                i_scaleBiasMeanVarStrides,
+                                i_scaleBiasMeanVarStrides,
+                                i_scaleBiasMeanVarStrides,
+                                x_dev.GetDeviceBuffer(),
+                                bnScale_dev.GetDeviceBuffer(),
+                                bnBias_dev.GetDeviceBuffer(),
+                                epsilon,
+                                estimatedMean_dev.GetDeviceBuffer(),
+                                estimatedVariance_dev.GetDeviceBuffer(),
+                                y_dev.GetDeviceBuffer());
    if(result < 0)
        return (false);
@@ -259,7 +263,15 @@ bool bnorm_infer_nhwc_test(bool do_verification,
    if(do_verification)
    {
-        auto batchNormInfer_ref = ReferenceBatchNormInferInstance<InOutDataType, AccDataType>{};
+        using ReferenceBatchNormInferInstance =
+            ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C<
+                InOutDataType,
+                InOutDataType,
+                AccDataType,
+                AccDataType,
+                AccDataType,
+                AccDataType>;
+        auto batchNormInfer_ref = ReferenceBatchNormInferInstance{};
        auto argument_ptr_ref =
            batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths,
@@ -267,6 +279,8 @@ bool bnorm_infer_nhwc_test(bool do_verification,
                                                   i_inOutStrides,
                                                   i_scaleBiasMeanVarLengths,
                                                   i_scaleBiasMeanVarStrides,
+                                                   i_scaleBiasMeanVarStrides,
+                                                   i_scaleBiasMeanVarStrides,
                                                   x.mData.data(),
                                                   bnScale.mData.data(),
                                                   bnBias.mData.data(),

--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -159,6 +159,11 @@
 // tuning parameter
 #define CK_WORKAROUND_SWDEV_325164 0
+// workaround: disable broken fused attention kernel instance that does not pass validation
+// issue found on mi100/#10738 combo when irregular KPerBlock attention kernel has acc0 scaling
+// enabled
+#define CK_WORKAROUND_DISABLE_BROKEN_ATTN_KERNEL_INSTANCE 1
 namespace ck {
 enum struct InMemoryDataOperationEnum

--- a/include/ck/tensor_description/tensor_space_filling_curve.hpp
+++ b/include/ck/tensor_description/tensor_space_filling_curve.hpp
@@ -14,7 +14,8 @@ namespace ck {
 template <typename TensorLengths,
          typename DimAccessOrder,
-          typename ScalarsPerAccess> // # of scalars per access in each dimension
+          typename ScalarsPerAccess, // # of scalars per access in each dimension
+          bool SnakeCurved = true>   // false: never reverse the access index of a dimension
 struct SpaceFillingCurve
 {
    static constexpr index_t nDim = TensorLengths::Size();
@@ -136,9 +137,10 @@ struct SpaceFillingCurve
            Index ordered_idx;
            static_for<0, nDim, 1>{}([&](auto idim) {
-                ordered_idx(idim) = forward_sweep[idim] ? ordered_access_idx[idim]
+                ordered_idx(idim) =
-                                                        : ordered_access_lengths[idim] - 1 -
+                    !SnakeCurved || forward_sweep[idim]
-                                                              ordered_access_idx[idim];
+                        ? ordered_access_idx[idim]
+                        : ordered_access_lengths[idim] - 1 - ordered_access_idx[idim];
            });
            return container_reorder_given_old2new(ordered_idx, dim_access_order) *

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -151,6 +151,27 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
        return make_tuple(c_thread_m, c_thread_n);
    }
+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i> // compile-time indices
+    __device__ static auto // device-only; the result is an 8-element tuple
+        CalculateCThreadOriginDataIndex8D(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
+    {
+        const auto wave_idx = GetWaveIdx(); // helper defined outside this hunk
+        const auto waveId_m = wave_idx[I0]; // first component of the wave index
+        const auto waveId_n = wave_idx[I1]; // second component of the wave index
+        const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk4D(xdlops_i, blk_i); // I0..I3 read below
+        return make_tuple(Number<m0>{}, // m0 and n0 stay compile-time constants in the result
+                          Number<n0>{},
+                          waveId_m,
+                          waveId_n,
+                          blk_idx[I0],
+                          blk_idx[I1],
+                          blk_idx[I2],
+                          blk_idx[I3]);
+    }
    __host__ __device__ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1()
    {
        static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
@@ -724,6 +745,21 @@ struct BlockwiseGemmXdlops_v2
        return make_tuple(c_thread_m, c_thread_n);
    }
+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i> // compile-time indices
+    __device__ static auto // device-only; the result is an 8-element tuple
+        CalculateCThreadOriginDataIndex8D(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
+    {
+        const auto wave_idx = GetWaveIdx(); // helper defined outside this hunk
+        const auto waveId_m = wave_idx[I0]; // first component of the wave index
+        const auto waveId_n = wave_idx[I1]; // second component of the wave index
+        const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk4D(xdlops_i, blk_i); // I0..I3 read below
+        return make_tuple( // NOTE(review): m0/n0 go in as plain index_t values, not Number<>{}
+            m0, n0, waveId_m, waveId_n, blk_idx[I0], blk_idx[I1], blk_idx[I2], blk_idx[I3]);
+    } // the ..._v1 struct's same-named method returns Number<m0>{}/Number<n0>{} - confirm intent
    using Tuple4 = decltype(CalculateAThreadOriginDataIndex());
    __host__ __device__ BlockwiseGemmXdlops_v2(Tuple4 a_origin = CalculateAThreadOriginDataIndex(),

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
@@ -24,7 +24,8 @@ template <typename ALayout,
          typename B0ElementwiseOperation,
          typename Acc0ElementwiseOperation,
          typename B1ElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          bool MaskOutUpperTriangle> // TODO: enum for mask type
 struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
@@ -7,49 +7,60 @@
 #include <vector>
 #include "device_base.hpp"
+#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-template <typename ALayout,
+template <index_t NumDimG,
-          typename B0Layout,
+          index_t NumDimM,
-          typename B1Layout,
+          index_t NumDimN,
-          typename CPermuteNumDims_G_M_Gemm1N, // Sequence<>
+          index_t NumDimK,
+          index_t NumDimO,
          typename ADataType,
          typename B0DataType,
          typename B1DataType,
          typename CDataType,
+          typename Acc0BiasDataType,
+          typename Acc1BiasDataType,
          typename AElementwiseOperation,
          typename B0ElementwiseOperation,
          typename Acc0ElementwiseOperation,
          typename B1ElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          MaskingSpecialization MaskingSpec>
 struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
 {
-    virtual std::unique_ptr<BaseArgument>
+    static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size();
-    MakeArgumentPointer(const void* p_a,
+    static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size();
-                        const void* p_b0,
-                        const void* p_b1,
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
-                        void* p_c,
+        const void* p_a,
-                        ck::index_t M,
+        const void* p_b0,
-                        ck::index_t N,
+        const void* p_b1,
-                        ck::index_t K,
+        void* p_c,
-                        ck::index_t O,
+        const std::array<void*, NumAcc0Bias> p_acc0_biases,
-                        ck::index_t Batch,
+        const std::array<void*, NumAcc1Bias> p_acc1_biases,
-                        std::vector<index_t> c_gs_ms_os_lengths,
+        const std::vector<index_t>& a_gs_ms_ks_lengths,
-                        std::vector<index_t> c_gs_ms_os_strides,
+        const std::vector<index_t>& a_gs_ms_ks_strides,
-                        ck::index_t StrideA,
+        const std::vector<index_t>& b_gs_ns_ks_lengths,
-                        ck::index_t StrideB0,
+        const std::vector<index_t>& b_gs_ns_ks_strides,
-                        ck::index_t StrideB1,
+        const std::vector<index_t>& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths
-                        ck::index_t BatchStrideA,
+        const std::vector<index_t>& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides
-                        ck::index_t BatchStrideB0,
+        const std::vector<index_t>& c_gs_ms_gemm1ns_lengths,       // c_gs_ms_os_lengths
-                        ck::index_t BatchStrideB1,
+        const std::vector<index_t>& c_gs_ms_gemm1ns_strides,       // c_gs_ms_os_strides
-                        AElementwiseOperation a_element_op,
+        const std::array<std::vector<index_t>, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths,
-                        B0ElementwiseOperation b0_element_op,
+        const std::array<std::vector<index_t>, NumAcc0Bias> acc0_biases_gs_ms_ns_strides,
-                        Acc0ElementwiseOperation acc0_element_op,
+        const std::array<std::vector<index_t>, NumAcc1Bias>
-                        B1ElementwiseOperation b1_element_op,
+            acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths
-                        CElementwiseOperation c_element_op) = 0;
+        const std::array<std::vector<index_t>, NumAcc1Bias>
+            acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides
+        AElementwiseOperation a_element_op,
+        B0ElementwiseOperation b0_element_op,
+        Acc0ElementwiseOperation acc0_element_op,
+        B1ElementwiseOperation b1_element_op,
+        CElementwiseOperation c_element_op) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
@@ -13,31 +13,36 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-template <index_t Rank, index_t NumBatchNormReduceDim>
+template <index_t Rank, index_t NumBatchNormReduceDim, typename YElementwiseOp>
 struct DeviceBatchNormFwd : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
        const std::array<index_t, Rank> xyLengths,
        const std::array<index_t, Rank> xStrides,
        const std::array<index_t, Rank> yStrides,
+        const std::array<int, NumBatchNormReduceDim> reduceDims,
        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
-        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
        const void* p_x,
        const void* bnScale,
        const void* bnBias,
+        double epsilon,
+        const YElementwiseOp y_elementwise_op,
        void* p_y,
+        void* resultSaveMean,
+        void* resultSaveInvVariance,
        double exponentialAverageFactor,
        void* resultRunningMean,
-        void* resultRunningVariance,
+        void* resultRunningVariance) = 0;
-        double epsilon,
-        void* resultSaveMean,
-        void* resultSaveInvVariance) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
-template <index_t Rank, index_t NumBatchNormReduceDim>
+template <index_t Rank, index_t NumBatchNormReduceDim, typename YElementwiseOp>
-using DeviceBatchNormFwdPtr = std::unique_ptr<DeviceBatchNormFwd<Rank, NumBatchNormReduceDim>>;
+using DeviceBatchNormFwdPtr =
+    std::unique_ptr<DeviceBatchNormFwd<Rank, NumBatchNormReduceDim, YElementwiseOp>>;
 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
@@ -21,7 +21,9 @@ struct DeviceBatchNormInfer : public BaseOperator
        const std::array<index_t, Rank> xStrides,
        const std::array<index_t, Rank> yStrides,
        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
-        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
        const void* p_x,
        const void* bnScale,
        const void* bnBias,

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <array>
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// Convolution Forward (this interface carries no auxiliary D tensors):
+//   input : input image A[G, N, C, Hi, Wi],
+//   input : weight B[G, K, C, Y, X],
+//   output : output image C[G, N, K, Ho, Wo]
+//   C = a_op(A) * b_op(B), with c_op applied to the result (presumably; confirm in the impl)
+//   a_/b_/c_ lengths and strides arrays hold NDimSpatial + 3 entries each;
+//   filter strides/dilations and left/right pads hold NDimSpatial entries each
+template <index_t NDimSpatial, // sizes the per-spatial-dimension arrays below
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGroupedConvFwd : public BaseOperator // abstract: both members are pure virtual
+{
+    virtual std::unique_ptr<BaseArgument> // NOTE(review): <memory> is not included directly here
+    MakeArgumentPointer(const void* p_a, // input image
+                        const void* p_b, // weight
+                        void* p_c,       // output image
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads,
+                        const AElementwiseOperation& a_element_op,
+                        const BElementwiseOperation& b_element_op,
+                        const CElementwiseOperation& c_element_op) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // takes no arguments
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
@@ -7,46 +7,50 @@
 #include <vector>
 #include "device_base.hpp"
+#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-template <typename ALayout,
+template <index_t NumDimG,
-          typename B0Layout,
+          index_t NumDimM,
-          typename B1Layout,
+          index_t NumDimN,
-          typename CPermuteNumDims_G_M_Gemm1N, // Sequence<>
+          index_t NumDimK,
+          index_t NumDimO,
          typename ADataType,
          typename B0DataType,
          typename B1DataType,
          typename CDataType,
+          typename Acc0BiasDataType,
+          typename Acc1BiasDataType,
          typename AElementwiseOperation,
          typename B0ElementwiseOperation,
          typename Acc0ElementwiseOperation,
          typename B1ElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          MaskingSpecialization MaskingSpec>
 struct DeviceGroupedGemmSoftmaxGemmPermute : public BaseOperator
 {
    struct ProblemDesc
    {
-        // Overall problem shape
+        std::vector<index_t> a_gs_ms_ks_lengths;
-        index_t M;
+        std::vector<index_t> a_gs_ms_ks_strides;
-        index_t N;
-        index_t K;
-        index_t O;
-        index_t Batch;
-        // Stride for A/B0/B1; layout determined by template args
+        std::vector<index_t> b0_gs_ns_ks_lengths;
-        index_t StrideA;
+        std::vector<index_t> b0_gs_ns_ks_strides;
-        index_t StrideB0;
-        index_t StrideB1;
+        std::vector<index_t> b1_gs_os_ns_lengths;
-        index_t BatchStrideA;
+        std::vector<index_t> b1_gs_os_ns_strides;
-        index_t BatchStrideB0;
-        index_t BatchStrideB1;
-        // Lengths and strides for output C
        std::vector<index_t> c_gs_ms_os_lengths;
        std::vector<index_t> c_gs_ms_os_strides;
+        std::vector<std::vector<index_t>> acc0_biases_gs_ms_ns_lengths;
+        std::vector<std::vector<index_t>> acc0_biases_gs_ms_ns_strides;
+        std::vector<std::vector<index_t>> acc1_biases_gs_ms_os_lengths;
+        std::vector<std::vector<index_t>> acc1_biases_gs_ms_os_strides;
    };
    virtual std::unique_ptr<BaseArgument>
@@ -54,6 +58,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute : public BaseOperator
                        std::vector<const void*> p_b0_vec,
                        std::vector<const void*> p_b1_vec,
                        std::vector<void*> p_c_vec,
+                        std::vector<std::vector<const void*>> p_acc0_biases_vec,
+                        std::vector<std::vector<const void*>> p_acc1_biases_vec,
                        std::vector<ProblemDesc> problem_desc_vec,
                        AElementwiseOperation a_element_op,
                        B0ElementwiseOperation b0_element_op,

--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -130,8 +130,11 @@ namespace device {
 //   D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
 //   E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...]
-// FIXME: TensorSpecialization::Packed specialization does not cover all packed tensor cases, it
+// NOTE: a TensorSpecialization::Packed tensor is "packed" in the sense that the inner
-// merely degenerates into TensorSpecialization::Default with NumDimG/M/N/K = 1
+// dimensions of each dimension group (e.g. [G0, G1] in Gs, [M0, M1, M2] in Ms, etc.) are
+// contiguous and ordered, not in the sense that the tensor [G0, G1, ..., M0, M1, ..., N0, N1, ...]
+// can be permuted while still being a contiguous, unpadded tensor. In other words, it merely
+// degenerates into TensorSpecialization::Default with NumDimG/M/N/K = 1.
 //
 // Detail- Packed tensor satisfies
 //   stride_0 = 1
@@ -147,7 +150,7 @@ namespace device {
 // essentially a degenerated case of TensorSpecialization::Default with NumDimG/M/N/K = 1.
 //
 // Might need to expose dimension order to the interface to fully support
-// TensorSpecialization::Packed.
+// TensorSpecialization::Packed in the traditional sense of a "packed" tensor.
 template <index_t NumDimG,
          index_t NumDimM,
          index_t NumDimN,

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -12,6 +12,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
 #include "ck/host_utility/device_prop.hpp"
@@ -196,7 +197,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
                                          BElementwiseOperation,
                                          AccElementwiseOperation,
                                          B1ElementwiseOperation,
-                                          CElementwiseOperation>
+                                          CElementwiseOperation,
+                                          MaskOutUpperTriangle>
 {
    using DeviceOp = DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle;
@@ -315,29 +317,6 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
        return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw);
    }
-    // to track the points which need to be set to -inf on C0
-    // Note: no need to reset M padding value, because they will not be stored out.
-    struct C0MatrixMask
-    {
-        C0MatrixMask(index_t NRaw) : NRaw_(NRaw) {}
-        __host__ __device__ bool IsUpperTriangle(index_t m, index_t n) const { return n > m; }
-        __host__ __device__ bool IsNOutOfBound(/*index_t m, */ index_t n) const
-        {
-            return n >= NRaw_;
-        }
-        __host__ __device__ bool IsMaskedElement(index_t m, index_t n) const
-        {
-            return IsUpperTriangle(m, n) || IsNOutOfBound(n);
-        }
-        private:
-        // index_t MRaw_;
-        index_t NRaw_;
-    };
    struct ComputeBasePtrOfStridedBatch
    {
        ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
@@ -383,6 +362,10 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
    using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1(1, 1, 1));
    using CGridDesc_M_N        = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
+    using C0MatrixMask = conditional_t<MaskOutUpperTriangle,
+                                       C0MatrixMask_impl<MaskOutUpperTrianglePredicate>,
+                                       C0MatrixMask_impl<MaskDisabledPredicate>>;
    // GridwiseGemm
    using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
        ADataType, // TODO: distinguish A/B datatype

--- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -214,6 +214,7 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                     MPerBlock,
                                     NPerBlock,
                                     K0PerBlock,
+                                     K1,
                                     M1PerThread,
                                     N1PerThread,
                                     KPerThread,

--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+namespace ck {
+namespace tensor_operation {
+namespace device {
+enum struct MaskingSpecialization // NOTE(review): presumably picks the C0 mask predicate; confirm
+{
+    MaskDisabled,        // rendered as "MaskDisabled" by getMaskingSpecializationString
+    MaskOutUpperTriangle // rendered as "MaskOutUpperTriangle" by getMaskingSpecializationString
+};
+inline std::string getMaskingSpecializationString(const MaskingSpecialization& s) // name of s
+{
+    switch(s)
+    {
+    case MaskingSpecialization::MaskDisabled: return "MaskDisabled";
+    case MaskingSpecialization::MaskOutUpperTriangle: return "MaskOutUpperTriangle";
+    default: return "Unrecognized specialization!"; // any value other than the two enumerators
+    }
+} // NOTE(review): uses std::string, but no #include <string> is visible in this header
+struct MaskDisabledPredicate // predicate whose two queries both always answer false
+{
+    __host__ __device__ constexpr bool operator()(index_t /*m*/, index_t /*n*/) const
+    {
+        return false; // no (m, n) element is reported as masked
+    }; // NOTE(review): the ';' after this function body is redundant
+    __host__ __device__ constexpr bool
+        IsTileSkippable(index_t /*m*/, index_t /*n*/, index_t /*m_tile*/, index_t /*n_tile*/) const
+    {
+        return false; // no tile is ever reported as skippable
+    }
+};
+struct MaskOutUpperTrianglePredicate // reports an element as masked when n > m
+{
+    __host__ __device__ constexpr bool operator()(index_t m, index_t n) const { return n > m; }
+    __host__ __device__ constexpr bool
+    IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t /*n_tile*/) const
+    { // presumably (m, n) is the tile origin, so m + m_tile - 1 is its last row - TODO confirm
+        return operator()(m + m_tile - 1, n); // true iff n > m + m_tile - 1; n_tile is unused
+    }
+};
+// Tracks which points of C0 need to be set to -inf.
+// Note: the M padding values need no reset, because they will not be stored out.
+template <typename MaskOutPredicate> // e.g. MaskDisabledPredicate, MaskOutUpperTrianglePredicate
+struct C0MatrixMask_impl
+{ // NOTE(review): the constructor below has no __host__ __device__ qualifiers, unlike the methods
+    C0MatrixMask_impl(index_t NRaw) : NRaw_(NRaw), predicate_(MaskOutPredicate{}) {}
+    __host__ __device__ constexpr bool IsNOutOfBound(/*index_t m, */ index_t n) const
+    {
+        return n >= NRaw_; // n is at or beyond the NRaw passed to the constructor
+    }
+    __host__ __device__ constexpr bool IsMaskedElement(index_t m, index_t n) const
+    {
+        return predicate_(m, n) || IsNOutOfBound(n); // masked by the predicate or out of N bound
+    }
+    __host__ __device__ constexpr bool
+    IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t n_tile) const
+    {
+        return predicate_.IsTileSkippable(m, n, m_tile, n_tile); // NRaw_ is not consulted here
+    }
+    private:
+    // index_t MRaw_;
+    index_t NRaw_;               // N bound checked by IsNOutOfBound
+    MaskOutPredicate predicate_; // default-constructed in the constructor
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck