gaoqiong / composable_kernel · Commits

Commit 94177eb6
Authored Jun 14, 2023 by danyao12

Merge branch 'attn-train-develop-qloop' into attn-train-develop-qloop-dropout-v2

Parents: 44f4498a, 71e2a917
Showing 2 changed files with 10 additions and 10 deletions:

example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp (+3, -3)
example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_backward_v2.cpp (+7, -7)
example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp

@@ -98,9 +98,9 @@ static constexpr auto TensorSpecY = ck::tensor_operation::device::TensorSpecia
 static constexpr bool Deterministic = false;
 
 // DIM should be a multiple of 8.
-// If DIM <= 32 , ues prototype1 1st template.
-// If 32 < DIM <= 64 , ues prototype1 2nd template.
-// If 64 < DIM <= 128, ues prototype2 2nd template.
+// If DIM <= 32 , ues prototype1.
+// If 32 < DIM <= 64 , ues prototype1.
+// If 64 < DIM <= 128, ues prototype2.
 #if(DIM <= 32)
 // clang-format off
 using DeviceGemmInstance =
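Note on the comment change above: the example picks its DeviceGemmInstance at compile time with a preprocessor branch on DIM (the head dimension); only the first branch, #if(DIM <= 32), is visible in this hunk. A minimal sketch of the dispatch shape those comments describe, where Prototype1 and Prototype2 are hypothetical stand-ins for the real ck::tensor_operation::device instantiations defined in the example:

// Sketch only: Prototype1/Prototype2 stand in for the device-op
// instantiations the example actually defines per DIM range.
#define DIM 64 // head dimension; the example requires a multiple of 8

struct Prototype1; // configuration used for DIM <= 64
struct Prototype2; // configuration used for 64 < DIM <= 128

#if (DIM <= 32)
using DeviceGemmInstance = Prototype1;
#elif (DIM <= 64)
using DeviceGemmInstance = Prototype1;
#elif (DIM <= 128)
using DeviceGemmInstance = Prototype2;
#else
#error "this example covers DIM <= 128 only"
#endif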
example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_backward_v2.cpp

@@ -62,9 +62,9 @@ using Scale = ck::tensor_operation::element_wise::Scale;
 using QKVElementOp = PassThrough;
 using YElementOp = PassThrough;
 
-using InputDataType = BF16;
-using OutputDataType = F32;
-using GemmDataType = BF16;
+using InputDataType = F16;
+using OutputDataType = F16;
+using GemmDataType = F16;
 using AccDataType = F32;
 using ShuffleDataType = F32;
 using LSEDataType = F32;
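This hunk moves the grouped example from BF16 inputs with F32 output to F16 end to end, while AccDataType, ShuffleDataType, and LSEDataType stay F32. For reference, the F16/BF16/F32 shorthands are assumed here to be the usual CK example aliases (their definitions are not shown in this diff):

#include "ck/utility/data_type.hpp"

// Assumed alias definitions, matching the pattern used across CK examples.
using F16 = ck::half_t;   // 16-bit IEEE half precision
using BF16 = ck::bhalf_t; // 16-bit bfloat16
using F32 = float;        // 32-bit float, kept for accumulation and LSE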
@@ -79,7 +79,7 @@ static constexpr ck::index_t NumDimK = 1;
 static constexpr ck::index_t NumDimO = 1;
 // When OutputDataType == F32, CShuffleBlockTransferScalarPerVector_NPerBlock = 4
 // When OutputDataType == F16/BF16, CShuffleBlockTransferScalarPerVector_NPerBlock = 8
-static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 4;
+static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;
 
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
 #if USING_MASK
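The two comments in this hunk encode a simple width rule: 4 F32 elements and 8 F16/BF16 elements both span 16 bytes, so the scalar-per-vector count follows the element size of OutputDataType, which this commit just changed from F32 to F16. An illustrative sketch of that rule written once instead of toggled by hand (the example itself uses the literal constants shown above):

#include <cstddef>

// Illustrative only: keep each vectorized CShuffle store 16 bytes wide.
//   sizeof(float) == 4        -> 16 / 4 = 4 scalars per vector (F32 output)
//   sizeof(half/bhalf_t) == 2 -> 16 / 2 = 8 scalars per vector (F16/BF16 output)
template <typename OutputDataType>
constexpr int kCShuffleScalarPerVector = 16 / static_cast<int>(sizeof(OutputDataType));

static_assert(kCShuffleScalarPerVector<float> == 4, "F32 output -> 4 per vector");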
@@ -97,9 +97,9 @@ static constexpr auto TensorSpecY = ck::tensor_operation::device::TensorSpecia
 static constexpr bool Deterministic = false;
 
 // DIM should be a multiple of 8.
-// If DIM <= 32 , ues prototype1 1st template.
-// If 32 < DIM <= 64 , ues prototype1 2nd template.
-// If 64 < DIM <= 128, ues prototype2 2nd template.
+// If DIM <= 32 , ues prototype1.
+// If 32 < DIM <= 64 , ues prototype1.
+// If 64 < DIM <= 128, ues prototype2.
 #if(DIM <= 32)
 // clang-format off
 using DeviceGemmInstance =