rename device operator

ac07f2cd · danyao12 · fsx950223 · b5af5db8 · ac07f2cd · ac07f2cd
Commit ac07f2cd, authored Jan 29, 2023 by danyao12; committed by fsx950223 on Jan 29, 2023.
2 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_pt1_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_pt1_fp16.cpp
@@ -84,7 +84,7 @@ static constexpr auto TensorSpecV = ck::tensor_operation::device::TensorSpeciali
 static constexpr auto TensorSpecY = ck::tensor_operation::device::TensorSpecialization::Default;
 using DeviceGemmInstance =
-    ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2<
        NumDimG,
        NumDimM,
        NumDimN,
@@ -248,8 +248,8 @@ int run(int argc, char* argv[])
    ck::index_t N  = 512;
    ck::index_t K  = 64;
    ck::index_t O  = 64;
-    ck::index_t G0 = 4;
+    ck::index_t G0 = 54; //54
-    ck::index_t G1 = 16;
+    ck::index_t G1 = 16; //16
    float alpha = 1.f / std::sqrt(K);

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp
@@ -15,7 +15,7 @@
 #include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v3.hpp"
 #include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -207,7 +207,7 @@ template <index_t NumDimG,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          MaskingSpecialization MaskingSpec,
          LoopScheduler LoopSched = LoopScheduler::Default>
-struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle
+struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
    : public BaseOperator // TODO inherit atten bwd op once API stablizes
 {
    static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0,
@@ -229,7 +229,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle
    static constexpr index_t NumDimGemm1K = NumDimN;
 #endif
-    using DeviceOp = DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle;
+    using DeviceOp = DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2;
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
@@ -1134,7 +1134,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle
        auto str = std::stringstream();
        // clang-format off
-        str << "DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle"
+        str << "DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2"
            << "<"
            << BlockSize << ", "
            << MPerBlock << ", "