change cshuffle precision type to f16; clean up

597155e8 · Anthony Chang · bf44991f · 597155e8 · 597155e8 · 597155e8
Commit 597155e8 authored May 31, 2022 by Anthony Chang
3 changed files
--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp
@@ -31,6 +31,7 @@ using BDataType   = F16;
 using CDataType   = F16;
 using C0DataType  = F16;
 using AccDataType = F32;
+using CShuffleDataType = F16;

 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
@@ -47,7 +48,9 @@ struct Relu

 using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+// Elementwise operation that operates on the output of matrix multiplication Acc = A * B
 using AccElementOp = Relu;
+// Elementwise operation that operates on the output of layer normalization
 using CElementOp = Relu;

 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -58,7 +61,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl
 //######|        |        |        |      Type|      Type|      Type|       Type|    DataType|         DataType|    DataType| Elementwise| Elementwise|  Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector|
 //######|        |        |        |          |          |          |           |            |                 |            |   Operation|   Operation|    Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|            _NPerBlock|
 //######|        |        |        |          |          |          |           |            |                 |            |            |            |             |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                      |
-        <     Row,     Col,     Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, AccDataType, AccDataType,  AElementOp,  BElementOp, AccElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8,             S<64, 4>,                     4>;
+        <     Row,     Col,     Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, CShuffleDataType, AccDataType,  AElementOp,  BElementOp, AccElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8,             S<64, 4>,                     4>;
 // clang-format on

 using ReferenceInstance = ck::tensor_operation::host::ReferenceGemmLayernorm<ADataType,
@@ -252,16 +255,16 @@ int main(int argc, char* argv[])

        ref_invoker.Run(ref_argument);

+        if constexpr(std::is_same<CShuffleDataType, F32>::value)
+        {
            pass &= ck::utils::check_err(
                c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c");
-
-        // if (!pass)
-        // {
-        //     LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result.mData, ",")
-        //                 << std::endl;
-        //     LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
-        //                 << std::endl;
-        // }
+        }
+        else if constexpr(std::is_same<CShuffleDataType, F16>::value)
+        {
+            pass &= ck::utils::check_err(
+                c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c", 1e-2, 1e-2);
+        }
    }
    return pass ? 0 : 1;
 }
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -664,7 +664,6 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                 make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0),
                 c_element_op};

-            // add bias: load bias to vgpr buffer, add to LDS
            const auto NBlock = c0_grid_desc_nblock_nperblock.GetLength(I0);

            // for broadcasting bias, beta, gamma

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
@@ -65,8 +65,6 @@ struct ReferenceGemmLayernorm : public device::BaseOperator
            }
            avg_acc_sq(i) = sum_acc_sq / N;
            avg_acc(i)    = sum_acc / N;
-            // std::cout << "avg_acc_(" << i << ") =" << avg_acc(i) << std::endl;
-            // std::cout << "avg_acc_sq_(" << i << ") =" << avg_acc_sq(i) << std::endl;
        }

        // normalize