Merge remote-tracking branch 'origin' into aosewski/test_ggemm_splitk

7546eb7a · Adam Osewski · 8ad9c393 · a1e344b1 · 7546eb7a · 7546eb7a
Commit 7546eb7a authored May 11, 2023 by Adam Osewski
5 changed files
--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+namespace ck {
+template <typename MeanVarDataType,
+          typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename YDataType,
+          typename ComputeDataType,
+          typename YElementwiseOperation,
+          typename MeanVarGridDesc_M_KBlock,
+          typename CountGridDesc_M_KBlock,
+          typename XYGammaBetaGridDesc_M_K,
+          index_t BlockSize,
+          index_t MThreadClusterSize,
+          index_t KThreadClusterSize,
+          index_t MThreadSliceSize,
+          index_t KThreadSliceSize,
+          index_t XSrcVectorDim,
+          index_t XSrcVectorSize,
+          index_t GammaSrcVectorDim,
+          index_t GammaSrcVectorSize,
+          index_t BetaSrcVectorDim,
+          index_t BetaSrcVectorSize,
+          index_t YDstVectorDim,
+          index_t YDstVectorSize>
+struct GridwiseNormalizationSplitK2nd
+{
+    static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
+                      (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+    static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) ||
+                      (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+    static_assert(XSrcVectorSize == YDstVectorSize);
+    static_assert(XSrcVectorSize == GammaSrcVectorSize);
+    static_assert(XSrcVectorSize == BetaSrcVectorSize);
+    static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
+    using ThreadBufferDimAccessOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+    using ThreadClusterArrangeOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+    static constexpr auto thread_cluster_desc =
+        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+    using ThreadBufferLengths_M_K                = Sequence<MThreadSliceSize, XSrcVectorSize>;
+    static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
+    using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
+    static constexpr auto thread_buffer_desc_m_1 =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, I1));
+    using ThreadWelfordSrcDesc_M_1 = decltype(thread_buffer_desc_m_1);
+    using ThreadWelfordDstDesc_M =
+        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
+    using ThreadwiseWelford =
+        ThreadwiseWelfordMerge<ComputeDataType, ThreadWelfordSrcDesc_M_1, ThreadWelfordDstDesc_M>;
+    using BlockwiseWelford = BlockwiseWelford<ComputeDataType,
+                                              BlockSize,
+                                              ThreadClusterLengths_M_K,
+                                              ThreadClusterArrangeOrder>;
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
+    static constexpr index_t M_BlockTileSize     = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize     = KThreadClusterSize * KThreadSliceSize;
+    static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize;
+    static constexpr auto ThreadBufferNumber = Number<KThreadSliceSize / XSrcVectorSize>{};
+    __device__ static void Run(const MeanVarGridDesc_M_KBlock& mean_var_grid_desc_m_kblock,
+                               const CountGridDesc_M_KBlock& count_grid_desc_m_kblock,
+                               const XYGammaBetaGridDesc_M_K& x_grid_desc_m_k,
+                               const XYGammaBetaGridDesc_M_K& gamma_grid_desc_m_k,
+                               const XYGammaBetaGridDesc_M_K& beta_grid_desc_m_k,
+                               const XYGammaBetaGridDesc_M_K& y_grid_desc_m_k,
+                               index_t num_k_mean_var_count_iteration,
+                               index_t num_k_block_tile_iteration,
+                               index_t k_grid_size,
+                               ComputeDataType epsilon,
+                               const MeanVarDataType* const p_mean_global,
+                               const MeanVarDataType* const p_variance_global,
+                               const int32_t* const p_welford_count_global,
+                               const XDataType* const __restrict__ p_x_global,
+                               const GammaDataType* const __restrict__ p_gamma_global,
+                               const BetaDataType* const __restrict__ p_beta_global,
+                               YDataType* const __restrict__ p_y_global,
+                               const YElementwiseOperation y_elementwise_op)
+    {
+        // Thread/Block id
+        const index_t thread_local_id    = get_thread_local_1d_id();
+        const index_t block_global_id    = get_block_1d_id();
+        const index_t block_m_cluster_id = block_global_id / k_grid_size;
+        const index_t block_k_cluster_id = block_global_id % k_grid_size;
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];
+        // Global Memory
+        const auto mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_mean_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
+        const auto var_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_variance_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
+        const auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_welford_count_global, count_grid_desc_m_kblock.GetElementSpaceSize());
+        const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_x_global, x_grid_desc_m_k.GetElementSpaceSize());
+        const auto gamma_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize());
+        const auto beta_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize());
+        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
+        // VGPR
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            in_mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            in_var_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
+            in_welford_count_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            var_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
+            welford_count_thread_buf;
+        auto x_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * XSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+        auto gamma_thread_buf = generate_tuple(
+            [&](auto) {
+                return StaticBuffer<AddressSpaceEnum::Vgpr,
+                                    ComputeDataType,
+                                    MThreadSliceSize * GammaSrcVectorSize,
+                                    true>{};
+            },
+            Number<ThreadBufferNumber>{});
+        auto& beta_thread_buf = gamma_thread_buf;
+        auto& y_thread_buf    = x_thread_buf;
+        // IO
+        auto threadwise_mean_var_load_m_kblock =
+            ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
+                                             ComputeDataType,
+                                             MeanVarGridDesc_M_KBlock,
+                                             decltype(thread_buffer_desc_m_1),
+                                             ThreadBufferLengths_M_1,
+                                             Sequence<0, 1>,
+                                             1,
+                                             1,
+                                             1,
+                                             true>(
+                mean_var_grid_desc_m_kblock,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 thread_k_cluster_id));
+        auto threadwise_count_load_m_kblock =
+            ThreadwiseTensorSliceTransfer_v2<int32_t,
+                                             int32_t,
+                                             CountGridDesc_M_KBlock,
+                                             decltype(thread_buffer_desc_m_1),
+                                             ThreadBufferLengths_M_1,
+                                             Sequence<0, 1>,
+                                             1,
+                                             1,
+                                             1,
+                                             true>(
+                count_grid_desc_m_kblock,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 thread_k_cluster_id));
+        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
+                                                                  ComputeDataType,
+                                                                  XYGammaBetaGridDesc_M_K,
+                                                                  decltype(thread_buffer_desc_m_k),
+                                                                  ThreadBufferLengths_M_K,
+                                                                  ThreadBufferDimAccessOrder,
+                                                                  XSrcVectorDim,
+                                                                  XSrcVectorSize,
+                                                                  1,
+                                                                  true>(
+            x_grid_desc_m_k,
+            make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                 thread_m_cluster_id * MThreadSliceSize,
+                             block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                 thread_k_cluster_id * XSrcVectorSize));
+        auto threadwise_gamma_load =
+            ThreadwiseTensorSliceTransfer_v2<GammaDataType,
+                                             ComputeDataType,
+                                             XYGammaBetaGridDesc_M_K,
+                                             decltype(thread_buffer_desc_m_k),
+                                             ThreadBufferLengths_M_K,
+                                             ThreadBufferDimAccessOrder,
+                                             GammaSrcVectorDim,
+                                             GammaSrcVectorSize,
+                                             1,
+                                             true>(
+                gamma_grid_desc_m_k,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                     thread_k_cluster_id * GammaSrcVectorSize));
+        auto threadwise_beta_load =
+            ThreadwiseTensorSliceTransfer_v2<BetaDataType,
+                                             ComputeDataType,
+                                             XYGammaBetaGridDesc_M_K,
+                                             decltype(thread_buffer_desc_m_k),
+                                             ThreadBufferLengths_M_K,
+                                             ThreadBufferDimAccessOrder,
+                                             BetaSrcVectorDim,
+                                             BetaSrcVectorSize,
+                                             1,
+                                             true>(
+                beta_grid_desc_m_k,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                     thread_k_cluster_id * BetaSrcVectorSize));
+        auto threadwise_y_store =
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+                                               YDataType,
+                                               decltype(thread_buffer_desc_m_k),
+                                               XYGammaBetaGridDesc_M_K,
+                                               YElementwiseOperation,
+                                               ThreadBufferLengths_M_K,
+                                               ThreadBufferDimAccessOrder,
+                                               YDstVectorDim,
+                                               YDstVectorSize,
+                                               InMemoryDataOperationEnum::Set,
+                                               1,
+                                               true>(
+                y_grid_desc_m_k,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                     thread_m_cluster_id * MThreadSliceSize,
+                                 block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
+                                     thread_k_cluster_id * YDstVectorSize),
+                y_elementwise_op);
+        // step1: Merge mean and variance
+        constexpr auto mean_var_count_thread_copy_step_I0_k =
+            make_multi_index(I0, KThreadClusterSize);
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            mean_thread_buf(I)          = type_convert<ComputeDataType>(0.0f);
+            var_thread_buf(I)           = type_convert<ComputeDataType>(0.0f);
+            welford_count_thread_buf(I) = 0;
+        });
+        for(index_t k = 0; k < num_k_mean_var_count_iteration; ++k)
+        {
+            threadwise_mean_var_load_m_kblock.Run(mean_var_grid_desc_m_kblock,
+                                                  mean_global_val_buf,
+                                                  thread_buffer_desc_m_1,
+                                                  make_tuple(I0, I0),
+                                                  in_mean_thread_buf);
+            threadwise_mean_var_load_m_kblock.Run(mean_var_grid_desc_m_kblock,
+                                                  var_global_val_buf,
+                                                  thread_buffer_desc_m_1,
+                                                  make_tuple(I0, I0),
+                                                  in_var_thread_buf);
+            threadwise_count_load_m_kblock.Run(count_grid_desc_m_kblock,
+                                               welford_count_global_val_buf,
+                                               thread_buffer_desc_m_1,
+                                               make_tuple(I0, I0),
+                                               in_welford_count_thread_buf);
+            ThreadwiseWelford::Run(in_mean_thread_buf,
+                                   in_var_thread_buf,
+                                   in_welford_count_thread_buf,
+                                   mean_thread_buf,
+                                   var_thread_buf,
+                                   welford_count_thread_buf);
+            threadwise_mean_var_load_m_kblock.MoveSrcSliceWindow(
+                mean_var_grid_desc_m_kblock, mean_var_count_thread_copy_step_I0_k);
+            threadwise_count_load_m_kblock.MoveSrcSliceWindow(count_grid_desc_m_kblock,
+                                                              mean_var_count_thread_copy_step_I0_k);
+        }
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            if constexpr(I > 0)
+                block_sync_lds();
+            BlockwiseWelford::Run(
+                mean_thread_buf(I), var_thread_buf(I), welford_count_thread_buf(I));
+        });
+        // step2: normalization
+        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
+        for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
+        {
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_x_load.Run(x_grid_desc_m_k,
+                                      x_global_val_buf,
+                                      thread_buffer_desc_m_k,
+                                      make_tuple(I0, I0),
+                                      x_thread_buf(i));
+                threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
+            });
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_gamma_load.Run(gamma_grid_desc_m_k,
+                                          gamma_global_val_buf,
+                                          thread_buffer_desc_m_k,
+                                          make_tuple(I0, I0),
+                                          gamma_thread_buf(i));
+                threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
+                                                         thread_copy_fwd_step_m_k);
+            });
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
+                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
+                        constexpr auto offset_m_k =
+                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
+                        // normalize
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
+                            divisor;
+                        // gamma
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) *
+                            gamma_thread_buf(iK0)(Number<offset_m_k>{});
+                    });
+                });
+            });
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_beta_load.Run(beta_grid_desc_m_k,
+                                         beta_global_val_buf,
+                                         thread_buffer_desc_m_k,
+                                         make_tuple(I0, I0),
+                                         beta_thread_buf(i));
+                threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
+                                                        thread_copy_fwd_step_m_k);
+            });
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
+                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
+                        constexpr auto offset_m_k =
+                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
+                        // beta
+                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
+                            y_thread_buf(iK0)(Number<offset_m_k>{}) +
+                            beta_thread_buf(iK0)(Number<offset_m_k>{});
+                    });
+                });
+            });
+            static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
+                threadwise_y_store.Run(thread_buffer_desc_m_k,
+                                       make_tuple(I0, I0),
+                                       y_thread_buf(i),
+                                       y_grid_desc_m_k,
+                                       y_global_val_buf);
+                threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k);
+            });
+        } // end for (normalization)
+    }
+};
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -6,6 +6,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor/static_tensor.hpp"
@@ -207,15 +208,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            auto src_vector_container = src_vector_type{
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
-            // apply SrcElementwiseOperation on src_vector_container
-            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                SrcData src_v;
-                src_element_op_(src_v, src_vector_container.template AsType<SrcData>()[i]);
-                src_vector_container.template AsType<SrcData>()(i) = src_v;
-            });
            // copy data from src_vector_container into src_thread_scratch_
            src_thread_scratch_tuple_(thread_scratch_id)
                .template SetAsType<src_vector_t>(
@@ -318,7 +310,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                constexpr auto data_idx_seq = generate_sequence_v2(
                    [&](auto i) { return Number<data_idx[i]>{}; }, Number<nDim>{});
-                // TODO type_convert is not used yet!!!!!
                using src_vector_t = vector_type_maker_t<SrcData, SrcScalarPerVector>;
                using dst_vector_t = vector_type_maker_t<DstData, DstScalarPerVector>;
@@ -342,19 +333,17 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                    Number<num_dst_vector>{});
                // do data transpose
-                // TODO type_convert is not used yet!!!!!
                transpose_vectors<SrcData, DstScalarPerVector, SrcScalarPerVector>{}(
                    src_vector_refs, dst_vector_refs);
            });
        }
-        else
-        {
+        static_ford<SliceLengths>{}([&](auto idx) {
-            static_ford<SliceLengths>{}([&](auto idx) {
+            // apply the src elementwise op and convert to DstData under the hood if needed
-                // convert from SrcData to DstData here
+            DstData dst_v;
-                dst_thread_scratch_(idx) =
+            src_element_op_(dst_v, src_thread_scratch_tuple_[thread_scratch_id][idx]);
-                    type_convert<DstData>(src_thread_scratch_tuple_[thread_scratch_id][idx]);
+            dst_thread_scratch_(idx) = dst_v;
-            });
+        });
-        }
 #endif
    }

--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -976,37 +976,6 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float
        uint32_t int32;
    } u = {x};
-    // When the exponent bits are not all 1s, then the value is zero, normal,
-    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-    // least significant bits of the float mantissa are greater than 0x8000,
-    // or if they are equal to 0x8000 and the least significant bit of the
-    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-    // has the value 0x7f, then incrementing it causes it to become 0x00 and
-    // the exponent is incremented by one, which is the next higher FP value
-    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
-    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-    // incrementing it causes it to become an exponent of 0xFF and a mantissa
-    // of 0x00, which is Inf, the next higher value to the unrounded value.
-    bool flag0 = ~u.int32 & 0x7f800000;
-    // When all of the exponent bits are 1, the value is Inf or NaN.
-    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-    // bit being 1. Signaling NaN is indicated by the most significant
-    // mantissa bit being 0 but some other bit(s) being 1. If any of the
-    // lower 16 bits of the mantissa are 1, we set the least significant bit
-    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-    // the bfloat16's mantissa bits are all 0.
-    bool flag1 = !flag0 && (u.int32 & 0xffff);
-    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
-    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
    return uint16_t(u.int32 >> 16);
 }
@@ -1064,6 +1033,63 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
    return type_convert<bhalf_t>(x_fp32);
 }
+// Declare a template function for bf16 conversion using RTN
+template <typename Y, typename X>
+__host__ __device__ constexpr Y bf16_convert_rtn(X x);
+// Convert fp32 to bf16 with RTN if higher precision is needed
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, float>(float x)
+{
+    union
+    {
+        float fp32;
+        uint32_t int32;
+    } u = {x};
+    // When the exponent bits are not all 1s, then the value is zero, normal,
+    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+    // least significant bits of the float mantissa are greater than 0x8000,
+    // or if they are equal to 0x8000 and the least significant bit of the
+    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+    // has the value 0x7f, then incrementing it causes it to become 0x00 and
+    // the exponent is incremented by one, which is the next higher FP value
+    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
+    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+    // incrementing it causes it to become an exponent of 0xFF and a mantissa
+    // of 0x00, which is Inf, the next higher value to the unrounded value.
+    bool flag0 = ~u.int32 & 0x7f800000;
+    // When all of the exponent bits are 1, the value is Inf or NaN.
+    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+    // bit being 1. Signaling NaN is indicated by the most significant
+    // mantissa bit being 0 but some other bit(s) being 1. If any of the
+    // lower 16 bits of the mantissa are 1, we set the least significant bit
+    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+    // the bfloat16's mantissa bits are all 0.
+    bool flag1 = !flag0 && (u.int32 & 0xffff);
+    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
+    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
+    return uint16_t(u.int32 >> 16);
+}
+// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed
+template <>
+inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(half_t x)
+{
+    float x_fp32 = static_cast<float>(x);
+    return bf16_convert_rtn<bhalf_t>(x_fp32);
+}
 template <typename T>
 struct NumericLimits
 {

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -6,6 +6,7 @@
 #include <iostream>
 #include <sstream>
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -66,8 +67,26 @@ struct ReferenceGemm : public device::BaseOperator
                    ADataType v_a;
                    BDataType v_b;
-                    arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                    // use PassThrough instead of ConvertBF16RTN for reference calculation
-                    arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    if constexpr(is_same_v<AElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
+                    }
+                    else
+                    {
+                        arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                    }
+                    // same for B matrix
+                    if constexpr(is_same_v<BElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
+                    }
+                    else
+                    {
+                        arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    }
                    v_acc +=
                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);