Merge branch 'mha-train-develop' into attn-train-develop-qloop-mask

8dd7156d · ltqin · d5f629e7 · b5a3ea2d · 8dd7156d · 8dd7156d
Commit 8dd7156d authored Jul 25, 2023 by ltqin
20 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/get_shift.hpp"
 namespace ck {
@@ -35,10 +35,11 @@ struct BlockwiseWelford
    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+    template <typename CountDataType>
    __device__ static inline void
-    Merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
+    Merge(T& mean_a, T& var_a, CountDataType& count_a, T mean_b, T var_b, CountDataType count_b)
    {
-        int count            = count_a + count_b;
+        CountDataType count  = count_a + count_b;
        T count_b_over_count = count == 0 ? type_convert<T>(0) : type_convert<T>(count_b) / count;
        T delta              = mean_b - mean_a;
        mean_a += delta * count_b_over_count;
@@ -46,11 +47,12 @@ struct BlockwiseWelford
        count_a = count;
    }
-    __device__ static void Run(T& mean_value, T& var_value, int& count)
+    template <typename CountDataType>
+    __device__ static void Run(T& mean_value, T& var_value, CountDataType& count)
    {
        __shared__ T mean_block_buf[BlockSize];
        __shared__ T var_block_buf[BlockSize];
-        __shared__ int count_block_buf[BlockSize];
+        __shared__ CountDataType count_block_buf[BlockSize];
        constexpr auto cluster_len_shift = get_shift<BufferLength_K>();
@@ -76,13 +78,13 @@ struct BlockwiseWelford
                index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
                                                                     make_tuple(0, indOffset));
-                T mean1    = mean_block_buf[offset1];
+                T mean1              = mean_block_buf[offset1];
-                T var1     = var_block_buf[offset1];
+                T var1               = var_block_buf[offset1];
-                int count1 = count_block_buf[offset1];
+                CountDataType count1 = count_block_buf[offset1];
-                T mean2    = mean_block_buf[offset2];
+                T mean2              = mean_block_buf[offset2];
-                T var2     = var_block_buf[offset2];
+                T var2               = var_block_buf[offset2];
-                int count2 = count_block_buf[offset2];
+                CountDataType count2 = count_block_buf[offset2];
                Merge(mean1, var1, count1, mean2, var2, count2);

--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/get_shift.hpp"
 #include "ck/utility/reduction_functions_accumulate.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -11,10 +11,15 @@
 namespace ck {
-// this version does following things to avoid scratch memory issue
+/**
-// 1. Use StaticallyIndexedArray instead of C array for thread buffer
+ * @brief Blockwise data transfer
-// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+ *
-// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+ * This version does following things to avoid scratch memory issue
+ * 1. Use StaticallyIndexedArray instead of C array for thread buffer
+ * 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+ * 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+ *
+ */
 template <typename ThreadGroup,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -19,8 +19,7 @@ getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecializat
    switch(s)
    {
    case ConvolutionBackwardDataSpecialization::Default: return "Default";
-    case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0:
+    case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
-        return "FFilter1x1Stride1Pad0";
    default: return "Unrecognized specialization!";
    }
 }

--- a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -27,9 +27,9 @@ template <index_t NumDimG,
          typename Acc1BiasDataType,
          typename AElementwiseOperation,
          typename B0ElementwiseOperation,
-          typename Acc0ElementwiseOperation,
+          typename C0DEElementwiseOperation,
          typename B1ElementwiseOperation,
-          typename CElementwiseOperation,
+          typename C1DEElementwiseOperation,
          MaskingSpecialization MaskingSpec>
 struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
 {
@@ -59,9 +59,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
            acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides
        AElementwiseOperation a_element_op,
        B0ElementwiseOperation b0_element_op,
-        Acc0ElementwiseOperation acc0_element_op,
+        C0DEElementwiseOperation c0de_element_op,
        B1ElementwiseOperation b1_element_op,
-        CElementwiseOperation c_element_op) = 0;
+        C1DEElementwiseOperation c1de_element_op) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once