From 8f7de8477759198d0944c3d6023cb803b4c452d8 Mon Sep 17 00:00:00 2001 From: yuguo960516yuguo Date: Tue, 25 Apr 2023 19:14:37 +0800 Subject: [PATCH] dtk --- README.md | 19 +- audit_dtk-22.04.2.py | 21 - audit_dtk-22.10.py | 21 - cmake/oneflow.cmake | 11 + .../embedding/cached_key_value_store.hip.cpp | 650 ++-- oneflow/core/embedding/full_cache.hip.cpp | 1278 +++---- oneflow/core/embedding/hash_functions.hip.h | 198 +- oneflow/core/embedding/lru_cache.hip.cpp | 1168 +++--- .../embedding/mock_key_value_store.hip.cpp | 496 +-- .../persistent_table_key_value_store.hip.cpp | 484 +-- oneflow/core/ep/rocm/cuda_device.cpp | 358 +- oneflow/core/ep/rocm/cuda_device.h | 156 +- oneflow/core/ep/rocm/cuda_device_manager.cpp | 136 +- oneflow/core/ep/rocm/cuda_device_manager.h | 108 +- .../ep/rocm/cuda_device_manager_factory.cpp | 234 +- oneflow/core/ep/rocm/cuda_event.cpp | 112 +- oneflow/core/ep/rocm/cuda_event.h | 100 +- oneflow/core/ep/rocm/cuda_stream.cpp | 360 +- oneflow/core/ep/rocm/cuda_stream.h | 336 +- oneflow/core/ep/rocm/primitive/add.hip.cpp | 278 +- .../ep/rocm/primitive/binary_functor.hip.h | 300 +- .../broadcast_elementwise_binary.hip.cpp | 218 +- .../broadcast_elementwise_binary.hip.h | 792 ++-- ...elementwise_binary_activation_grad.hip.cpp | 78 +- ...ast_elementwise_binary_comparision.hip.cpp | 76 +- ...oadcast_elementwise_binary_logical.hip.cpp | 76 +- .../broadcast_elementwise_binary_math.hip.cpp | 70 +- .../ep/rocm/primitive/broadcast_matmul.cpp | 474 +-- oneflow/core/ep/rocm/primitive/cast.hip.cpp | 296 +- .../ep/rocm/primitive/constant_pad.hip.cpp | 508 +-- .../core/ep/rocm/primitive/copy_nd.hip.cpp | 190 +- .../rocm/primitive/elementwise_unary.hip.cpp | 232 +- oneflow/core/ep/rocm/primitive/fill.hip.cpp | 302 +- oneflow/core/ep/rocm/primitive/memcpy.cpp | 124 +- oneflow/core/ep/rocm/primitive/memset.cpp | 118 +- .../core/ep/rocm/primitive/permute.hip.cpp | 666 ++-- .../core/ep/rocm/primitive/softmax.hip.cpp | 214 +- .../rocm/primitive/softmax_backward.hip.cpp | 232 +- oneflow/core/ep/rocm/primitive/type_seq.h | 154 +- .../ep/rocm/primitive/unary_functor.hip.h | 340 +- .../framework/random_generator_impl.hip.cpp | 90 +- oneflow/core/hip/atomic.hip.h | 428 +-- oneflow/core/hip/elementwise.hip.h | 486 +-- oneflow/core/hip/layer_norm.hip.h | 3212 ++++++++--------- oneflow/core/hip/softmax.hip.h | 2998 +++++++-------- oneflow/core/hip/unique.hip.h | 502 +-- .../nccl_executor_backend.hip.cpp | 1328 +++---- .../insert_nccl_logical_op_pass.cpp | 2 +- ...uential_one_embedding_shuffle_ops_pass.cpp | 158 +- ...uda_check_numerics_kernel_observer.hip.cpp | 264 +- oneflow/core/kernel/kernel_util.hip.h | 106 +- oneflow/core/kernel/random_generator.hip.cpp | 116 +- oneflow/core/kernel/util/numeric_limits.hip.h | 254 +- oneflow/core/kernel/util/numerics.hip.h | 498 +-- .../ndarray/ndarray_apply_binary_core.hip.cpp | 134 +- ...darray_apply_broadcast_binary_core.hip.cpp | 378 +- ...ndarray_apply_broadcast_unary_core.hip.cpp | 90 +- .../ndarray/ndarray_apply_unary_core.hip.cpp | 92 +- .../core/ndarray/ndarray_assign_core.hip.cpp | 124 +- .../core/ndarray/ndarray_reduce_impl.hip.cpp | 764 ++-- .../core/ndarray/xpu_ndarray_assign.hip.cpp | 122 +- oneflow/core/profiler/event.cpp | 180 +- oneflow/core/profiler/event.h | 372 +- oneflow/core/profiler/event_recorder.h | 120 +- oneflow/core/vm/sync_vm_mode_guard.h | 76 +- .../kernels/adaptive_pool_gpu_kernel.hip.cpp | 590 +-- .../user/kernels/affine_grid_kernel.hip.cpp | 264 +- .../user/kernels/arange_kernel_util.hip.cpp | 94 +- 
oneflow/user/kernels/arg_sort_kernel.hip.cpp | 294 +- .../kernels/arg_where_kernel_util.hip.cpp | 282 +- oneflow/user/kernels/argmax_kernel.hip.cpp | 386 +- .../user/kernels/as_strided_kernel.hip.cpp | 396 +- oneflow/user/kernels/assign_if_kernel.hip.cpp | 150 +- oneflow/user/kernels/avg_pool_kernel.hip.cpp | 398 +- .../kernels/batch_gather_kernel_util.hip.cpp | 204 +- .../binary_cross_entropy_kernel.hip.cpp | 406 +-- ...y_cross_entropy_with_logits_kernel.hip.cpp | 744 ++-- ...ss_entropy_with_logits_mean_kernel.hip.cpp | 552 +-- .../kernels/broadcast_pow_grad_kernel.hip.cpp | 174 +- ...gorical_ordinal_encode_kernel_util.hip.cpp | 248 +- .../user/kernels/clip_by_value_kernel.hip.cpp | 142 +- .../combined_margin_loss_kernel.hip.cpp | 448 +-- .../kernels/count_not_finite_kernel.hip.cpp | 344 +- .../user/kernels/ctc_greedy_decoder.hip.cpp | 290 +- .../user/kernels/ctc_loss_kernel_util.hip.cpp | 568 +-- .../user/kernels/cum_backward_kernel.hip.cpp | 276 +- .../user/kernels/cum_forward_kernel.hip.cpp | 336 +- .../user/kernels/data_shuffle_kernel.hip.cpp | 3044 ++++++++-------- oneflow/user/kernels/diag_kernel.hip.cpp | 158 +- oneflow/user/kernels/diagonal_kernel.hip.cpp | 324 +- .../kernels/dim_gather_kernel_util.hip.cpp | 128 +- .../kernels/dim_scatter_kernel_util.hip.cpp | 132 +- .../dim_scatter_scalar_kernel_util.hip.cpp | 100 +- .../distributions/normal_distribution.hip.cpp | 140 +- .../uniform_distribution.hip.cpp | 152 +- .../uniform_int_distribution.hip.cpp | 142 +- oneflow/user/kernels/dropout_kernel.hip.cpp | 924 ++--- ...dynamic_loss_scale_schedule_kernel.hip.cpp | 132 +- .../user/kernels/eager_nccl_kernels.hip.cpp | 806 ++--- ...elementwise_maximum_minimum_kernel.hip.cpp | 112 +- oneflow/user/kernels/embedding_kernel.hip.cpp | 318 +- .../kernels/embedding_kernel_util.hip.cpp | 362 +- oneflow/user/kernels/erfinv_kernel.hip.cpp | 120 +- oneflow/user/kernels/expand_kernel.hip.cpp | 438 +-- oneflow/user/kernels/eye_kernel_util.hip.cpp | 78 +- .../kernels/fake_quantization_kernel.hip.cpp | 318 +- oneflow/user/kernels/fill_kernel.hip.cpp | 120 +- oneflow/user/kernels/flip_kernel.hip.cpp | 206 +- oneflow/user/kernels/fold_kernel_util.hip.cpp | 148 +- .../kernels/fused_bias_add_kernel.hip.cpp | 910 ++--- .../kernels/fused_cast_scale_kernel.hip.cpp | 222 +- .../fused_cross_feature_interaction.hip.cpp | 516 +-- ...sed_cross_feature_interaction_grad.hip.cpp | 908 ++--- ...sed_dot_feature_interaction_kernel.hip.cpp | 1844 +++++----- .../kernels/fused_gru_cell_kernel.hip.cpp | 942 ++--- .../kernels/fused_lstm_cell_kernel.hip.cpp | 1008 +++--- .../fused_relu_dropout_grad_kernel.hip.cpp | 290 +- .../kernels/fused_scale_mask_softmax.hip.cpp | 470 +-- .../kernels/fused_scale_mask_softmax.hip.h | 430 +-- .../fused_scale_mask_softmax_dropout.hip.cpp | 604 ++-- ...ion_query_mul_key_and_value_kernel.hip.cpp | 584 +-- ...il_scale_softmax_mask_scale_kernel.hip.cpp | 456 +-- .../user/kernels/gather_kernel_util.hip.cpp | 244 +- ...m_batch_permutation_indices_kernel.hip.cpp | 274 +- .../heap_selection_top_k_kernel.hip.cpp | 464 +-- .../kernels/image_preprocess_kernels.hip.cpp | 430 +-- .../user/kernels/in_top_k_kernel_util.hip.cpp | 134 +- oneflow/user/kernels/kl_div_kernel.hip.cpp | 240 +- ...l2_regularize_gradient_kernel_util.hip.cpp | 100 +- .../user/kernels/l2_normalize_kernel.hip.cpp | 298 +- .../kernels/layer_norm_gpu_kernel.hip.cpp | 1143 +++--- .../math_binary_elementwise_kernel.hip.cpp | 486 +-- .../kernels/math_unary_elementwise_func.h | 1966 +++++----- .../math_unary_elementwise_kernel.hip.cpp | 352 
+-
 oneflow/user/kernels/max_pool_kernel.hip.cpp | 576 +--
 oneflow/user/kernels/median_kernel.hip.cpp | 136 +-
 .../median_with_indices_kernel.hip.cpp | 310 +-
 .../kernels/min_max_observer_kernel.hip.cpp | 518 +--
 .../kernels/model_update_kernel_util.hip.cpp | 1596 ++++----
 ...ng_average_min_max_observer_kernel.hip.cpp | 632 ++--
 .../user/kernels/multi_reduce_kernels.hip.cpp | 282 +-
 .../kernels/nd_index_slice_kernels.hip.cpp | 330 +-
 oneflow/user/kernels/nll_kernel_util.hip.cpp | 184 +-
 oneflow/user/kernels/nms_kernel.hip.cpp | 288 +-
 .../user/kernels/normalization_kernel.hip.cpp | 1066 +++---
 .../user/kernels/nvtx_range_kernel.hip.cpp | 276 +-
 .../kernels/one_embedding_kernels.hip.cpp | 1266 +++----
 .../one_embedding_update_kernels.hip.cpp | 1206 +++----
 oneflow/user/kernels/one_hot_kernel.hip.cpp | 160 +-
 .../user/kernels/pad2d_kernels_util.hip.cpp | 426 +--
 .../kernels/partial_fc_sample_kernel.hip.cpp | 860 ++---
 oneflow/user/kernels/prelu_kernel.hip.cpp | 1008 +++---
 .../user/kernels/quantization_kernel.hip.cpp | 316 +-
 oneflow/user/kernels/radix_sort.hip.h | 558 +--
 .../kernels/radix_sort_top_k_kernel.hip.cpp | 288 +-
 .../kernels/random_mask_generator.hip.cpp | 136 +-
 oneflow/user/kernels/randperm_kernel.hip.cpp | 400 +-
 .../kernels/repeat_interleave_kernel.hip.cpp | 144 +-
 oneflow/user/kernels/roi_align_kernel.hip.cpp | 602 +--
 oneflow/user/kernels/roll_kernel.hip.cpp | 588 +--
 .../user/kernels/scalar_math_kernels.hip.cpp | 444 +--
 .../user/kernels/search_sorted_kernel.hip.cpp | 256 +-
 .../sigmoid_cross_entropy_kernel.hip.cpp | 108 +-
 oneflow/user/kernels/slice_util.hip.cpp | 462 +--
 .../kernels/smooth_l1_loss_kernel.hip.cpp | 288 +-
 .../softmax_cross_entropy_kernel.hip.cpp | 310 +-
 oneflow/user/kernels/sort_kernel.hip.cpp | 160 +-
 .../sparse_cross_entropy_kernel_util.hip.cpp | 532 +--
 ...parse_softmax_cross_entropy_kernel.hip.cpp | 260 +-
 ..._softmax_cross_entropy_kernel_util.hip.cpp | 266 +-
 .../sqrt_square_sum_kernel_util.hip.cpp | 164 +-
 .../kernels/square_sum_kernel_util.hip.cpp | 208 +-
 oneflow/user/kernels/stateful_opkernel.cpp | 1802 ++++-----
 oneflow/user/kernels/tf_prelu_kernel.hip.cpp | 506 +--
 .../user/kernels/to_contiguous_kernel.hip.cpp | 320 +-
 oneflow/user/kernels/tril_kernel.hip.cpp | 510 +--
 oneflow/user/kernels/triu_kernel.hip.cpp | 260 +-
 .../two_stage_reduce_kernel_util.hip.cpp | 132 +-
 .../user/kernels/unfold_kernel_util.hip.cpp | 138 +-
 .../user/kernels/unfold_tensor_kernel.hip.cpp | 442 +--
 .../user/kernels/unique_kernel_util.hip.cpp | 172 +-
 .../unsorted_segment_sum_kernel_util.hip.cpp | 442 +--
 .../upsample_bicubic_2d_kernel.hip.cpp | 466 +--
 .../upsample_bilinear_2d_kernel.hip.cpp | 378 +-
 .../kernels/upsample_linear_1d_kernel.hip.cpp | 324 +-
 .../kernels/upsample_nearest_kernel.hip.cpp | 822 ++---
 .../upsample_trilinear_3d_kernel.hip.cpp | 472 +--
 .../user/kernels/variance_kernel_util.hip.cpp | 382 +-
 .../user/kernels/where_kernel_util.hip.cpp | 178 +-
 .../modules/fused_dot_feature_interaction.py | 86 +-
 python/oneflow/test/modules/test_conv.py | 692 ++--
 .../test/modules/test_softmax_cross_entropy | 348 +-
 .../test/profiler/test_profile_lenet.py | 296 +-
 version_script.lds | 14 +-
 194 files changed, 41577 insertions(+), 41396 deletions(-)
 delete mode 100644 audit_dtk-22.04.2.py
 delete mode 100644 audit_dtk-22.10.py

diff --git a/README.md b/README.md
index 15f73ae..8d49a2c 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,6 @@
-# OneFlow
+# OneFlow
 
-OneFlow is a deep learning framework designed to be **user-friendly, scalable and efficient**. With OneFlow, it is easy to:
-- program a model with **PyTorch-like API**
-- scale a model to n-dimensional-parallel/distributed execution with the **Global View API**
-- accelerate/deploy a model with the **Static Graph Compiler**.
+**OneFlow is a performance-centered and open-source deep learning framework.**
 
 [![Simple CI](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml/badge.svg)](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml)
 [![Nightly Docker Image](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml/badge.svg)](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml)
@@ -12,8 +9,10 @@
 
 ## Latest News
 
-- Version 0.8.0 is out!
-  - [Full changelog](https://github.com/Oneflow-Inc/oneflow/releases/tag/v0.8.0)
+- Version 0.7.0 is out!
+  - Introducing global tensor
+  - Semi-auto parallelization has landed
+  - [Full changelog](https://github.com/Oneflow-Inc/oneflow/releases/tag/v0.7.0)
 
 ## Publication
 
@@ -36,7 +35,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
 ### System Requirements
 
 - Linux. As for now, there is no pre-built release for macOS, Windows.
-- Python 3.7, 3.8, 3.9, 3.10
+- Python 3.6, 3.7, 3.8, 3.9, 3.10
 - (**Highly recommended**) Upgrade pip
 
   ```
@@ -54,7 +53,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
 - To install latest stable release of OneFlow with CUDA support:
 
   ```bash
-  python3 -m pip install oneflow
+  python3 -m pip install -f https://release.oneflow.info oneflow==0.7.0+cu102
   ```
 
 - To install nightly release of OneFlow with CUDA support:
@@ -67,7 +66,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
 - Stable
   ```bash
-  python3 -m pip install --find-links https://release.oneflow.info oneflow==0.8.0+[PLATFORM]
+  python3 -m pip install --find-links https://release.oneflow.info oneflow==0.7.0+[PLATFORM]
   ```
 
 - Nightly
   ```
diff --git a/audit_dtk-22.04.2.py b/audit_dtk-22.04.2.py
deleted file mode 100644
index 6646a3b..0000000
--- a/audit_dtk-22.04.2.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Monkey patch to not ship libjvm.so in pypi wheels
-import sys
-
-from auditwheel.main import main
-from auditwheel.policy import _POLICIES as POLICIES
-
-# libjvm is loaded dynamically; do not include it
-for p in POLICIES:
-    p['lib_whitelist'].append('librccl.so.1')
-    p['lib_whitelist'].append('libhipblas.so.0')
-    p['lib_whitelist'].append('libhiprand.so.1')
-    p['lib_whitelist'].append('librocrand.so.1')
-    p['lib_whitelist'].append('libMIOpen.so.1')
-    p['lib_whitelist'].append('libgalaxyhip.so.4')
-    p['lib_whitelist'].append('librocm_smi64.so.2')
-    p['lib_whitelist'].append('librocsolver.so.0 ')
-    p['lib_whitelist'].append('librocblas.so.0')
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/audit_dtk-22.10.py b/audit_dtk-22.10.py
deleted file mode 100644
index db45007..0000000
--- a/audit_dtk-22.10.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Monkey patch to not ship libjvm.so in pypi wheels
-import sys
-
-from auditwheel.main import main
-from auditwheel.policy import _POLICIES as POLICIES
-
-# libjvm is loaded dynamically; do not include it
-for p in POLICIES:
-    p['lib_whitelist'].append('librccl.so.1')
-    p['lib_whitelist'].append('libhipblas.so.0')
-    p['lib_whitelist'].append('libhiprand.so.1')
-    p['lib_whitelist'].append('librocrand.so.1')
-    p['lib_whitelist'].append('libMIOpen.so.1')
-    p['lib_whitelist'].append('libgalaxyhip.so.5')
-    p['lib_whitelist'].append('librocm_smi64.so.2')
-    p['lib_whitelist'].append('librocsolver.so.0 ')
-    p['lib_whitelist'].append('librocblas.so.0')
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake
index fff0d9d..a9de1f2 100644
--- a/cmake/oneflow.cmake
+++ b/cmake/oneflow.cmake
@@ -328,6 +328,17 @@ if(BUILD_PYTHON OR BUILD_CPP_API)
   endif()
 endif()
 
+if (BUILD_ROCM)
+  # AMD compiler fails to compile these three files with '-O1/2/3'.
+  # The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
+  # so '-O0' will override '-O1/2/3'.
+  set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel.hip.cpp
+    ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp
+    ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel.hip.cpp
+    # ${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math.hip.cpp
+    PROPERTIES COMPILE_OPTIONS "-O0")
+endif()
+
 if(BUILD_PYTHON)
 
   # py ext lib
diff --git a/oneflow/core/embedding/cached_key_value_store.hip.cpp b/oneflow/core/embedding/cached_key_value_store.hip.cpp
index 88456a4..a0a215e 100644
--- a/oneflow/core/embedding/cached_key_value_store.hip.cpp
+++ b/oneflow/core/embedding/cached_key_value_store.hip.cpp
@@ -1,326 +1,326 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/embedding/cached_key_value_store.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/ep/include/device_manager_registry.h" - -namespace oneflow { - -namespace embedding { - -namespace { -template -__global__ void PostStoreGetKernel(uint32_t num_cache_missing, uint32_t num_store_missing, - uint32_t num_elems_per_value, - const uint32_t* cache_missing_indices, - const uint32_t* store_missing_indices, const Elem* store_values, - Elem* values, uint32_t* missing_indices) { - const uint32_t num_cache_missing_elem = num_cache_missing * num_elems_per_value; - CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_cache_missing_elem) { - const uint32_t value_index = i / num_elems_per_value; - const uint32_t elem_index = i - value_index * num_elems_per_value; - values[cache_missing_indices[value_index] * num_elems_per_value + elem_index] = store_values[i]; - } - CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_store_missing) { - missing_indices[i] = cache_missing_indices[store_missing_indices[i]]; - } -} - -template -class CacheKeyValueStoreImpl : public KeyValueStore { - public: - OF_DISALLOW_COPY_AND_MOVE(CacheKeyValueStoreImpl); - CacheKeyValueStoreImpl(std::unique_ptr&& store, std::unique_ptr&& cache) - : store_(std::move(store)), cache_(std::move(cache)), synced_(true), max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - CHECK_EQ(store_->KeySize(), cache_->KeySize()); - CHECK_EQ(store_->ValueSize(), cache_->ValueSize()); - OF_CUDA_CHECK(hipMalloc(&num_buffer_, sizeof(uint32_t))); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_num_buffer_), sizeof(uint32_t))); - num_elems_per_value_ = store_->ValueSize() / sizeof(Elem); - } - ~CacheKeyValueStoreImpl() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipFree(num_buffer_)); - OF_CUDA_CHECK(hipHostFree(host_num_buffer_)); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(keys_buffer_)); - OF_CUDA_CHECK(hipFree(values_buffer_)); - OF_CUDA_CHECK(hipFree(indices_buffer0_)); - OF_CUDA_CHECK(hipFree(indices_buffer1_)); - } - cache_.reset(); - store_.reset(); - } - - uint32_t KeySize() const override { return store_->KeySize(); } - uint32_t ValueSize() const override { return store_->ValueSize(); } - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if (query_length > cache_->MaxQueryLength()) { cache_->ReserveQueryLength(query_length); } - if (query_length > store_->MaxQueryLength()) { store_->ReserveQueryLength(query_length); } - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(keys_buffer_)); - OF_CUDA_CHECK(hipFree(values_buffer_)); - OF_CUDA_CHECK(hipFree(indices_buffer0_)); - OF_CUDA_CHECK(hipFree(indices_buffer1_)); - } - OF_CUDA_CHECK(hipMalloc(&keys_buffer_, query_length * store_->KeySize())); - OF_CUDA_CHECK(hipMalloc(&values_buffer_, query_length * store_->ValueSize())); - OF_CUDA_CHECK(hipMalloc(&indices_buffer0_, query_length * sizeof(uint32_t))); - OF_CUDA_CHECK(hipMalloc(&indices_buffer1_, query_length * sizeof(uint32_t))); - max_query_length_ = query_length; - } - - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint32_t* n_missing, uint32_t* missing_indices) override; - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint8_t* mask) override; - void Put(ep::Stream* stream, uint32_t 
num_keys, const void* keys, const void* values) override; - void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - const void* update, const float* lr, float scale) override; - bool IsFusionSupported() override { - return cache_->Policy() == CacheOptions::Policy::kFull - && cache_->ValueType() == DataType::kFloat; - } - bool SnapshotExists(const std::string& name) override; - void LoadSnapshot(const std::string& name) override; - void SaveSnapshot(const std::string& name) override; - void LoadSnapshot(const std::string& name, - const std::function& Hook) override; - - private: - void SyncCacheToStore(); - - std::unique_ptr store_; - std::unique_ptr cache_; - - uint32_t* num_buffer_{}; - uint32_t* host_num_buffer_{}; - Key* keys_buffer_{}; - Elem* values_buffer_{}; - uint32_t* indices_buffer0_{}; - uint32_t* indices_buffer1_{}; - int device_index_{}; - uint32_t max_query_length_; - uint32_t num_elems_per_value_{}; - std::recursive_mutex mutex_; - bool synced_; -}; - -template -void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint32_t* n_missing, - uint32_t* missing_indices) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - if (cache_->Policy() == CacheOptions::Policy::kFull) { - cache_->Get(stream, num_keys, keys, values, n_missing, keys_buffer_, missing_indices); - return; - } else { - cache_->Get(stream, num_keys, keys, values, num_buffer_, keys_buffer_, indices_buffer0_); - } - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - const uint32_t num_cache_missing = *host_num_buffer_; - if (num_cache_missing == 0) { - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - return; - } - store_->Get(stream, num_cache_missing, keys_buffer_, values_buffer_, n_missing, indices_buffer1_); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, n_missing, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - const uint32_t num_store_missing = *host_num_buffer_; - RUN_CUDA_KERNEL((PostStoreGetKernel), stream, num_cache_missing * num_elems_per_value_, - num_cache_missing, num_store_missing, num_elems_per_value_, indices_buffer0_, - indices_buffer1_, values_buffer_, static_cast(values), missing_indices); -} - -template -void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint8_t* mask) { - std::lock_guard lock(mutex_); - if (cache_->Policy() == CacheOptions::Policy::kFull) { - cache_->Get(stream, num_keys, keys, values, mask); - return; - } else { - UNIMPLEMENTED(); - } -} - -template -void CacheKeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, - const void* values) { - std::lock_guard lock(mutex_); - synced_ = false; - auto cuda_stream = stream->As(); - cache_->Put(stream, num_keys, keys, values, num_buffer_, keys_buffer_, values_buffer_); - if (cache_->Policy() == CacheOptions::Policy::kFull) { return; } - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); -} - -template -void CacheKeyValueStoreImpl::FusedHalfUpdatePut(ep::Stream* stream, uint32_t num_keys, - const void* keys, const void* values, - const void* update, 
const float* lr, - float scale) { - std::lock_guard lock(mutex_); - if (cache_->Policy() != CacheOptions::Policy::kFull || cache_->ValueType() != DataType::kFloat) { - UNIMPLEMENTED(); - } - synced_ = false; - cache_->FusedHalfUpdatePut(stream, num_keys, keys, values, update, lr, scale, num_buffer_, - keys_buffer_, values_buffer_); -} - -template -bool CacheKeyValueStoreImpl::SnapshotExists(const std::string& name) { - return store_->SnapshotExists(name); -} - -template -void CacheKeyValueStoreImpl::LoadSnapshot(const std::string& name) { - LoadSnapshot(name, nullptr); -} - -template -void CacheKeyValueStoreImpl::LoadSnapshot( - const std::string& name, const std::function& Hook) { - CudaCurrentDeviceGuard guard(device_index_); - std::lock_guard lock(mutex_); - CHECK_GT(max_query_length_, 0); - cache_->Clear(); - auto device = - Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); - CHECK(device); - auto* stream = device->CreateStream(); - store_->LoadSnapshot(name, [&](KVIterator* iter) { - if (cache_->Policy() == CacheOptions::Policy::kFull) { - auto* cuda_stream = stream->As(); - while (true) { - iter->NextN(stream, max_query_length_, num_buffer_, keys_buffer_, values_buffer_); - OF_CUDA_CHECK(hipDeviceSynchronize()); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(stream->Sync()); - if (*host_num_buffer_ == 0) { return; } - cache_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_, num_buffer_, nullptr, - nullptr); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(stream->Sync()); - CHECK_EQ(*host_num_buffer_, 0); - } - } - if (Hook) { - iter->Reset(); - Hook(iter); - } - }); - device->DestroyStream(stream); - store_->LoadSnapshot(name); -} - -template -void CacheKeyValueStoreImpl::SaveSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - std::lock_guard lock(mutex_); - SyncCacheToStore(); - store_->SaveSnapshot(name); -} - -template -void CacheKeyValueStoreImpl::SyncCacheToStore() { - if (synced_) { return; } - CudaCurrentDeviceGuard guard(device_index_); - auto device = - Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); - CHECK(device); - auto* stream = device->CreateStream(); - auto* cuda_stream = stream->As(); - const uint64_t dump_capacity = cache_->DumpCapacity(); - CHECK_GT(max_query_length_, 0); - for (uint64_t start_key_index = 0; start_key_index < dump_capacity; - start_key_index += max_query_length_) { - cache_->Dump(stream, start_key_index, - std::min(start_key_index + max_query_length_, dump_capacity), num_buffer_, - keys_buffer_, values_buffer_); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(stream->Sync()); - if (*host_num_buffer_ == 0) { continue; } - store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); - CHECK_JUST(stream->Sync()); - } - device->DestroyStream(stream); - synced_ = true; -} - -template -std::unique_ptr DispatchElemType(std::unique_ptr&& store, - std::unique_ptr&& cache) { - const uint32_t value_size = store->ValueSize(); - if (value_size % sizeof(uint4) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } else if (value_size % sizeof(uint64_t) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); 
- } else if (value_size % sizeof(uint32_t) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } else if (value_size % sizeof(uint16_t) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } else { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } -} - -std::unique_ptr DispatchKeyType(std::unique_ptr&& store, - std::unique_ptr&& cache) { - const uint32_t key_size = store->KeySize(); - if (key_size == 4) { - return DispatchElemType(std::move(store), std::move(cache)); - } else if (key_size == 8) { - return DispatchElemType(std::move(store), std::move(cache)); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace - -std::unique_ptr NewCachedKeyValueStore(std::unique_ptr&& store, - std::unique_ptr&& cache) { - return DispatchKeyType(std::move(store), std::move(cache)); -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/embedding/cached_key_value_store.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/ep/include/device_manager_registry.h" + +namespace oneflow { + +namespace embedding { + +namespace { +template +__global__ void PostStoreGetKernel(uint32_t num_cache_missing, uint32_t num_store_missing, + uint32_t num_elems_per_value, + const uint32_t* cache_missing_indices, + const uint32_t* store_missing_indices, const Elem* store_values, + Elem* values, uint32_t* missing_indices) { + const uint32_t num_cache_missing_elem = num_cache_missing * num_elems_per_value; + CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_cache_missing_elem) { + const uint32_t value_index = i / num_elems_per_value; + const uint32_t elem_index = i - value_index * num_elems_per_value; + values[cache_missing_indices[value_index] * num_elems_per_value + elem_index] = store_values[i]; + } + CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_store_missing) { + missing_indices[i] = cache_missing_indices[store_missing_indices[i]]; + } +} + +template +class CacheKeyValueStoreImpl : public KeyValueStore { + public: + OF_DISALLOW_COPY_AND_MOVE(CacheKeyValueStoreImpl); + CacheKeyValueStoreImpl(std::unique_ptr&& store, std::unique_ptr&& cache) + : store_(std::move(store)), cache_(std::move(cache)), synced_(true), max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + CHECK_EQ(store_->KeySize(), cache_->KeySize()); + CHECK_EQ(store_->ValueSize(), cache_->ValueSize()); + OF_CUDA_CHECK(hipMalloc(&num_buffer_, sizeof(uint32_t))); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_num_buffer_), sizeof(uint32_t))); + num_elems_per_value_ = store_->ValueSize() / sizeof(Elem); + } + ~CacheKeyValueStoreImpl() { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipFree(num_buffer_)); + OF_CUDA_CHECK(hipHostFree(host_num_buffer_)); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(keys_buffer_)); + 
OF_CUDA_CHECK(hipFree(values_buffer_)); + OF_CUDA_CHECK(hipFree(indices_buffer0_)); + OF_CUDA_CHECK(hipFree(indices_buffer1_)); + } + cache_.reset(); + store_.reset(); + } + + uint32_t KeySize() const override { return store_->KeySize(); } + uint32_t ValueSize() const override { return store_->ValueSize(); } + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { return; } + if (query_length > cache_->MaxQueryLength()) { cache_->ReserveQueryLength(query_length); } + if (query_length > store_->MaxQueryLength()) { store_->ReserveQueryLength(query_length); } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(keys_buffer_)); + OF_CUDA_CHECK(hipFree(values_buffer_)); + OF_CUDA_CHECK(hipFree(indices_buffer0_)); + OF_CUDA_CHECK(hipFree(indices_buffer1_)); + } + OF_CUDA_CHECK(hipMalloc(&keys_buffer_, query_length * store_->KeySize())); + OF_CUDA_CHECK(hipMalloc(&values_buffer_, query_length * store_->ValueSize())); + OF_CUDA_CHECK(hipMalloc(&indices_buffer0_, query_length * sizeof(uint32_t))); + OF_CUDA_CHECK(hipMalloc(&indices_buffer1_, query_length * sizeof(uint32_t))); + max_query_length_ = query_length; + } + + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint32_t* n_missing, uint32_t* missing_indices) override; + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint8_t* mask) override; + void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; + void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + const void* update, const float* lr, float scale) override; + bool IsFusionSupported() override { + return cache_->Policy() == CacheOptions::Policy::kFull + && cache_->ValueType() == DataType::kFloat; + } + bool SnapshotExists(const std::string& name) override; + void LoadSnapshot(const std::string& name) override; + void SaveSnapshot(const std::string& name) override; + void LoadSnapshot(const std::string& name, + const std::function& Hook) override; + + private: + void SyncCacheToStore(); + + std::unique_ptr store_; + std::unique_ptr cache_; + + uint32_t* num_buffer_{}; + uint32_t* host_num_buffer_{}; + Key* keys_buffer_{}; + Elem* values_buffer_{}; + uint32_t* indices_buffer0_{}; + uint32_t* indices_buffer1_{}; + int device_index_{}; + uint32_t max_query_length_; + uint32_t num_elems_per_value_{}; + std::recursive_mutex mutex_; + bool synced_; +}; + +template +void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint32_t* n_missing, + uint32_t* missing_indices) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + if (cache_->Policy() == CacheOptions::Policy::kFull) { + cache_->Get(stream, num_keys, keys, values, n_missing, keys_buffer_, missing_indices); + return; + } else { + cache_->Get(stream, num_keys, keys, values, num_buffer_, keys_buffer_, indices_buffer0_); + } + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + const uint32_t num_cache_missing = *host_num_buffer_; + if (num_cache_missing == 0) { + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + return; + } + store_->Get(stream, num_cache_missing, keys_buffer_, values_buffer_, 
n_missing, indices_buffer1_); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, n_missing, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + const uint32_t num_store_missing = *host_num_buffer_; + RUN_CUDA_KERNEL((PostStoreGetKernel), stream, num_cache_missing * num_elems_per_value_, + num_cache_missing, num_store_missing, num_elems_per_value_, indices_buffer0_, + indices_buffer1_, values_buffer_, static_cast(values), missing_indices); +} + +template +void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint8_t* mask) { + std::lock_guard lock(mutex_); + if (cache_->Policy() == CacheOptions::Policy::kFull) { + cache_->Get(stream, num_keys, keys, values, mask); + return; + } else { + UNIMPLEMENTED(); + } +} + +template +void CacheKeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, + const void* values) { + std::lock_guard lock(mutex_); + synced_ = false; + auto cuda_stream = stream->As(); + cache_->Put(stream, num_keys, keys, values, num_buffer_, keys_buffer_, values_buffer_); + if (cache_->Policy() == CacheOptions::Policy::kFull) { return; } + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); +} + +template +void CacheKeyValueStoreImpl::FusedHalfUpdatePut(ep::Stream* stream, uint32_t num_keys, + const void* keys, const void* values, + const void* update, const float* lr, + float scale) { + std::lock_guard lock(mutex_); + if (cache_->Policy() != CacheOptions::Policy::kFull || cache_->ValueType() != DataType::kFloat) { + UNIMPLEMENTED(); + } + synced_ = false; + cache_->FusedHalfUpdatePut(stream, num_keys, keys, values, update, lr, scale, num_buffer_, + keys_buffer_, values_buffer_); +} + +template +bool CacheKeyValueStoreImpl::SnapshotExists(const std::string& name) { + return store_->SnapshotExists(name); +} + +template +void CacheKeyValueStoreImpl::LoadSnapshot(const std::string& name) { + LoadSnapshot(name, nullptr); +} + +template +void CacheKeyValueStoreImpl::LoadSnapshot( + const std::string& name, const std::function& Hook) { + CudaCurrentDeviceGuard guard(device_index_); + std::lock_guard lock(mutex_); + CHECK_GT(max_query_length_, 0); + cache_->Clear(); + auto device = + Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); + CHECK(device); + auto* stream = device->CreateStream(); + store_->LoadSnapshot(name, [&](KVIterator* iter) { + if (cache_->Policy() == CacheOptions::Policy::kFull) { + auto* cuda_stream = stream->As(); + while (true) { + iter->NextN(stream, max_query_length_, num_buffer_, keys_buffer_, values_buffer_); + OF_CUDA_CHECK(hipDeviceSynchronize()); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(stream->Sync()); + if (*host_num_buffer_ == 0) { return; } + cache_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_, num_buffer_, nullptr, + nullptr); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(stream->Sync()); + CHECK_EQ(*host_num_buffer_, 0); + } + } + if (Hook) { + iter->Reset(); + Hook(iter); + } + }); + device->DestroyStream(stream); + store_->LoadSnapshot(name); +} + +template +void CacheKeyValueStoreImpl::SaveSnapshot(const std::string& name) 
{ + CudaCurrentDeviceGuard guard(device_index_); + std::lock_guard lock(mutex_); + SyncCacheToStore(); + store_->SaveSnapshot(name); +} + +template +void CacheKeyValueStoreImpl::SyncCacheToStore() { + if (synced_) { return; } + CudaCurrentDeviceGuard guard(device_index_); + auto device = + Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); + CHECK(device); + auto* stream = device->CreateStream(); + auto* cuda_stream = stream->As(); + const uint64_t dump_capacity = cache_->DumpCapacity(); + CHECK_GT(max_query_length_, 0); + for (uint64_t start_key_index = 0; start_key_index < dump_capacity; + start_key_index += max_query_length_) { + cache_->Dump(stream, start_key_index, + std::min(start_key_index + max_query_length_, dump_capacity), num_buffer_, + keys_buffer_, values_buffer_); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(stream->Sync()); + if (*host_num_buffer_ == 0) { continue; } + store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); + CHECK_JUST(stream->Sync()); + } + device->DestroyStream(stream); + synced_ = true; +} + +template +std::unique_ptr DispatchElemType(std::unique_ptr&& store, + std::unique_ptr&& cache) { + const uint32_t value_size = store->ValueSize(); + if (value_size % sizeof(uint4) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else if (value_size % sizeof(uint64_t) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else if (value_size % sizeof(uint32_t) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else if (value_size % sizeof(uint16_t) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } +} + +std::unique_ptr DispatchKeyType(std::unique_ptr&& store, + std::unique_ptr&& cache) { + const uint32_t key_size = store->KeySize(); + if (key_size == 4) { + return DispatchElemType(std::move(store), std::move(cache)); + } else if (key_size == 8) { + return DispatchElemType(std::move(store), std::move(cache)); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace + +std::unique_ptr NewCachedKeyValueStore(std::unique_ptr&& store, + std::unique_ptr&& cache) { + return DispatchKeyType(std::move(store), std::move(cache)); +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/full_cache.hip.cpp b/oneflow/core/embedding/full_cache.hip.cpp index 164ecec..fed4182 100644 --- a/oneflow/core/embedding/full_cache.hip.cpp +++ b/oneflow/core/embedding/full_cache.hip.cpp @@ -1,640 +1,640 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/embedding/full_cache.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/embedding/hash_functions.hip.h" -#include "oneflow/core/hip/atomic.hip.h" - -namespace oneflow { - -namespace embedding { - -using Key32 = unsigned int; -using Key64 = unsigned long long int; -using Key128 = ulonglong2; - -namespace { - -template -__device__ bool TryGetOrInsert(Key* entry_key, volatile Index* entry_index, Index* table_size, - Key key, Index* out) { - Key key_hi = (key | 0x1); - Key key_lo = (key & 0x1); - Index index_plus_one = 0; - Key old_entry_key = cuda::atomic::CAS(entry_key, static_cast(0), key_hi); - while (index_plus_one == 0) { - if (old_entry_key == static_cast(0)) { - Index index = cuda::atomic::Add(table_size, static_cast(1)); - index_plus_one = index + 1; - *entry_index = ((index_plus_one << 1U) | key_lo); - *out = index_plus_one; - return true; - } else if (old_entry_key == key_hi) { - const Index entry_index_val = *entry_index; - if (entry_index_val == 0) { - // do nothing - } else if ((entry_index_val & 0x1) == key_lo) { - *out = (entry_index_val >> 1U); - return true; - } else { - return false; - } - } else { - return false; - } - } - return false; -} - -template -__device__ bool GetOrInsertOne(const size_t capacity, Key* table_keys, Index* table_indices, - Index* table_size, Key key, size_t hash, Index* out) { - const size_t start_idx = hash % capacity; - for (size_t count = 0; count < capacity; ++count) { - const size_t idx = (start_idx + count) % capacity; - Key* entry_key = table_keys + idx; - Index* entry_index = table_indices + idx; - if (TryGetOrInsert(entry_key, entry_index, table_size, key, out)) { return true; } - } - return false; -} - -template -__device__ bool GetOne(const size_t capacity, Key* table_keys, Index* table_indices, Key key, - size_t hash, Index* out) { - const size_t start_idx = hash % capacity; - for (size_t count = 0; count < capacity; ++count) { - const size_t idx = (start_idx + count) % capacity; - Key entry_key = table_keys[idx]; - Key entry_index = table_indices[idx]; - Key key_hi = (key | 0x1); - Key key_lo = (key & 0x1); - if (entry_key == 0) { break; } - if (entry_key == key_hi) { - if ((entry_index & 0x1) == key_lo) { - *out = (entry_index >> 1U); - return true; - } - } - } - *out = 0; - return false; -} - -template -__global__ void OrdinalEncodeKernel(uint64_t capacity, Key* table_keys, Index* table_indices, - Index* table_size, uint32_t num_keys, const Key* keys, - Index* context) { - CUDA_1D_KERNEL_LOOP(i, num_keys) { - Key key = keys[i]; - uint64_t hash = FullCacheHash()(key); - bool success = GetOrInsertOne(capacity, table_keys, table_indices, table_size, key, - hash, context + i); - assert(success); - } -} - -template -__global__ void OrdinalEncodeLookupKernel(uint64_t capacity, Key* table_keys, Index* table_indices, - uint32_t num_keys, const Key* keys, Index* context) { - CUDA_1D_KERNEL_LOOP(i, num_keys) { - Key key = keys[i]; - uint64_t hash = FullCacheHash()(key); - GetOne(capacity, table_keys, table_indices, key, hash, context + i); - } -} - -template -__global__ void OrdinalEncodeDumpKernel(const Key* table_keys, const Index* table_indices, - uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, Key* keys, Index* context) { - CUDA_1D_KERNEL_LOOP(i, (end_key_index - start_key_index)) { - Key entry_key = table_keys[i + start_key_index]; - Index entry_index = table_indices[i + start_key_index]; - if (entry_index != 0) { - uint32_t index = 
cuda::atomic::Add(n_dumped, static_cast(1)); - keys[index] = ((entry_key ^ 0x1) | (entry_index & 0x1)); - context[index] = (entry_index >> 1U); - } - } -} - -template -__global__ void LookupKernel(uint32_t value_length, const Elem* cache_values, - uint32_t values_elem_cnt, const Key* keys, const Index* context, - Elem* values, uint32_t* n_missing, Key* missing_keys, - uint32_t* missing_indices) { - CUDA_1D_KERNEL_LOOP(i, values_elem_cnt) { - const uint64_t key_id = i / value_length; - const uint64_t ctx = context[key_id]; - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * value_length; - if (ctx == 0) { - const Key missing_key = keys[key_id]; - if (col_id == 0) { - const uint32_t old_n_missing = cuda::atomic::Add(n_missing, static_cast(1)); - missing_keys[old_n_missing] = missing_key; - missing_indices[old_n_missing] = key_id; - } - continue; - } - if (return_value) { values[i] = cache_values[row_id * value_length + col_id]; } - } -} - -template -__global__ void EncodeLookupKernel(uint32_t value_length, const Elem* cache_values, - uint32_t values_elem_cnt, const Key* keys, const Index* context, - Elem* values, uint32_t* n_missing, Key* missing_keys, - uint32_t* missing_indices, const size_t capacity, - Key* table_keys, Index* table_indices) { - constexpr uint32_t warp_size = 32; - constexpr uint32_t n_warp_per_block = block_size / warp_size; - const uint32_t warp_id = threadIdx.x / warp_size; - const uint32_t lane_id = threadIdx.x % warp_size; - const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; - const uint32_t global_n_warp = gridDim.x * n_warp_per_block; - const uint32_t n_keys = values_elem_cnt / value_length; - __shared__ Key batch_keys[n_warp_per_block][warp_size]; - __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; - __shared__ Key batch_missing_keys[n_warp_per_block][warp_size]; - __shared__ uint32_t batch_missing_indices[n_warp_per_block][warp_size]; - __shared__ uint32_t batch_n_missing[n_warp_per_block]; - for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; - batch_start += global_n_warp * warp_size) { - const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); - if (lane_id == 0) { batch_n_missing[warp_id] = 0; } - __syncthreads(); - const uint32_t key_offset = batch_start + lane_id; - if (key_offset < n_keys) { - const Key key = keys[batch_start + lane_id]; - const uint64_t hash = FullCacheHash()(key); - Index row; - GetOne(capacity, table_keys, table_indices, key, hash, &row); - batch_row_ids[warp_id][lane_id] = row; - if (row == 0) { - const uint32_t batch_missing_idx = atomicAdd(batch_n_missing + warp_id, 1); - batch_missing_keys[warp_id][batch_missing_idx] = key; - batch_missing_indices[warp_id][batch_missing_idx] = key_offset; - } - } - __syncthreads(); - const uint32_t batch_n_missing_t = batch_n_missing[warp_id]; - if (lane_id == 0) { - const uint32_t old_n_missing = - cuda::atomic::Add(n_missing, static_cast(batch_n_missing_t)); - batch_n_missing[warp_id] = old_n_missing; - } - __syncthreads(); - if (lane_id < batch_n_missing_t) { - missing_keys[batch_n_missing[warp_id] + lane_id] = batch_missing_keys[warp_id][lane_id]; - missing_indices[batch_n_missing[warp_id] + lane_id] = batch_missing_indices[warp_id][lane_id]; - } - for (int i = 0; i < batch_n_key; ++i) { - const Key key = batch_keys[warp_id][i]; - const Index row = batch_row_ids[warp_id][i]; - if (row == 0) { continue; } - for (int col = lane_id; col < value_length; col += warp_size) { - values[(batch_start + i) * 
value_length + col] = - cache_values[(row - 1) * value_length + col]; - } - } - __syncthreads(); - } -} - -template -struct alignas(sizeof(T) * pack_size) Pack { - T elem[pack_size]; -}; - -template -__global__ void EncodeLookupMaskKernel(uint32_t value_length, const Elem* __restrict__ cache_values, - uint32_t values_elem_cnt, const Key* __restrict__ keys, - const Index* __restrict__ context, Elem* __restrict__ values, - uint8_t* __restrict__ mask, const size_t capacity, - Key* __restrict__ table_keys, - Index* __restrict__ table_indices) { - const uint32_t packed_cols = value_length / pack_size; - auto* packed_values = reinterpret_cast*>(values); - const auto* packed_cache_values = reinterpret_cast*>(cache_values); - constexpr uint32_t warp_size = 32; - constexpr uint32_t n_warp_per_block = block_size / warp_size; - const uint32_t warp_id = threadIdx.x / warp_size; - const uint32_t lane_id = threadIdx.x % warp_size; - const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; - const uint32_t global_n_warp = gridDim.x * n_warp_per_block; - const uint32_t n_keys = values_elem_cnt / value_length; - __shared__ Key batch_keys[n_warp_per_block][warp_size]; - __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; - for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; - batch_start += global_n_warp * warp_size) { - const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); - const uint32_t key_offset = batch_start + lane_id; - if (key_offset < n_keys) { - const Key key = keys[batch_start + lane_id]; - const uint64_t hash = FullCacheHash()(key); - Index row; - GetOne(capacity, table_keys, table_indices, key, hash, &row); - batch_row_ids[warp_id][lane_id] = row; - mask[key_offset] = row > 0; - } - __syncthreads(); - for (int i = 0; i < batch_n_key; ++i) { - const Key key = batch_keys[warp_id][i]; - const Index row = batch_row_ids[warp_id][i]; - if (row == 0) { continue; } -#pragma unroll 4 - for (int col = lane_id; col < packed_cols; col += warp_size) { - packed_values[(batch_start + i) * packed_cols + col] = - packed_cache_values[(row - 1) * packed_cols + col]; - } - } - __syncthreads(); - } -} - -template -__global__ void UpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, - const Index* context, const Elem* values) { - const int packed_values_elem_cnt = values_elem_cnt / pack_size; - const uint32_t packed_elem_cnt = value_length / pack_size; - auto* packed_cache_values = reinterpret_cast*>(cache_values); - auto* packed_values = reinterpret_cast*>(values); - CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { - const uint64_t key_id = i / packed_elem_cnt; - const uint64_t ctx = context[key_id]; - if (ctx == 0) { continue; } - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * packed_elem_cnt; - packed_cache_values[row_id * packed_elem_cnt + col_id] = packed_values[i]; - } -} - -template -__global__ typename std::enable_if::value, void>::type -FusedHalfUpdateKernel(uint32_t value_length, Elem* __restrict__ cache_values, - uint32_t values_elem_cnt, const Index* __restrict__ context, - const Elem* __restrict__ values, const half* __restrict__ update, - const float* __restrict__ lr, float scale) { - const int packed_values_elem_cnt = values_elem_cnt / pack_size; - const uint32_t packed_elem_cnt = value_length / pack_size; - auto* packed_cache_values = reinterpret_cast*>(cache_values); - auto* packed_values = reinterpret_cast*>(values); - auto* packed_update = reinterpret_cast*>(update); - const float 
alpha = -*lr * scale; - CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { - const uint64_t key_id = i / packed_elem_cnt; - const uint64_t ctx = context[key_id]; - if (ctx == 0) { continue; } - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * packed_elem_cnt; - Pack m = packed_values[i]; - Pack u = packed_update[i]; - for (size_t j = 0; j < pack_size; ++j) { m.elem[j] += static_cast(u.elem[j]) * alpha; } - packed_cache_values[row_id * packed_elem_cnt + col_id] = m; - } -} - -template -__global__ typename std::enable_if::value, void>::type -FusedHalfUpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, - const Index* context, const Elem* values, const half* update, const float* lr, - float scale) { - asm volatile("s_trap 0;"); -} - -template -__global__ void DumpValueKernel(uint32_t value_length, const uint32_t* n_dumped, - const Index* context, const Elem* cache_values, Elem* values) { - CUDA_1D_KERNEL_LOOP(i, *n_dumped * value_length) { - const uint64_t key_id = i / value_length; - const uint64_t ctx = context[key_id]; - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * value_length; - values[i] = cache_values[row_id * value_length + col_id]; - } -} - -template -class OrdinalEncoder { - public: - OF_DISALLOW_COPY_AND_MOVE(OrdinalEncoder); - explicit OrdinalEncoder(uint64_t capacity, float load_factor) - : capacity_(capacity), table_capacity_(capacity / load_factor) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - OF_CUDA_CHECK(hipMalloc(&table_size_, sizeof(Index))); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&table_size_host_), sizeof(Index))); - OF_CUDA_CHECK(hipMalloc(&table_keys_, table_capacity_ * sizeof(Key))); - OF_CUDA_CHECK(hipMalloc(&table_indices_, table_capacity_ * sizeof(Index))); - Clear(); - } - ~OrdinalEncoder() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipFree(table_size_)); - OF_CUDA_CHECK(hipHostFree(table_size_host_)); - OF_CUDA_CHECK(hipFree(table_keys_)); - OF_CUDA_CHECK(hipFree(table_indices_)); - } - - template - void Encode(ep::Stream* stream, uint32_t num_keys, const Key* keys, Index* context) { - if (insert) { - RUN_CUDA_KERNEL((OrdinalEncodeKernel), stream, num_keys, table_capacity_, - table_keys_, table_indices_, table_size_, num_keys, keys, context); - OF_CUDA_CHECK(hipMemcpyAsync(table_size_host_, table_size_, sizeof(Index), hipMemcpyDefault, - stream->As()->cuda_stream())); - CHECK_JUST(stream->Sync()); - CHECK_LT(*table_size_host_, capacity_) - << "The number of key is larger than cache size, please enlarge cache_memory_budget. 
"; - } else { - RUN_CUDA_KERNEL((OrdinalEncodeLookupKernel), stream, num_keys, table_capacity_, - table_keys_, table_indices_, num_keys, keys, context); - } - } - - void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, Key* keys, Index* context) { - OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - RUN_CUDA_KERNEL((OrdinalEncodeDumpKernel), stream, end_key_index - start_key_index, - table_keys_, table_indices_, start_key_index, end_key_index, n_dumped, keys, - context); - } - - void Clear() { - OF_CUDA_CHECK(hipMemset(table_size_, 0, sizeof(Index))); - OF_CUDA_CHECK(hipMemset(table_keys_, 0, table_capacity_ * sizeof(Key))); - OF_CUDA_CHECK(hipMemset(table_indices_, 0, table_capacity_ * sizeof(Index))); - } - - uint64_t TableCapacity() const { return table_capacity_; } - - Key* table_keys() const { return table_keys_; } - - Index* table_indices() const { return table_indices_; } - - private: - int device_index_{}; - Key* table_keys_; - Index* table_indices_; - uint64_t capacity_; - uint64_t table_capacity_; - Index* table_size_{}; - Index* table_size_host_{}; -}; - -template -class CacheImpl : public Cache { - public: - OF_DISALLOW_COPY_AND_MOVE(CacheImpl); - explicit CacheImpl(const CacheOptions& options) - : encoder_(options.capacity, options.load_factor), - device_index_(-1), - options_(options), - max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - const uint64_t values_size = options.capacity * options.value_size; - if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipMalloc(&values_, values_size)); - } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { - if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&values_), values_size)); - } else { - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&values_), - values_size)); - } - } else { - UNIMPLEMENTED(); - } - num_elem_per_value_ = options_.value_size / sizeof(Elem); - } - ~CacheImpl() { - CudaCurrentDeviceGuard guard(device_index_); - if (options_.value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipFree(values_)); - } else if (options_.value_memory_kind == CacheOptions::MemoryKind::kHost) { - OF_CUDA_CHECK(hipHostFree(values_)); - } else { - UNIMPLEMENTED(); - } - if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } - } - - uint64_t Capacity() const override { return options_.capacity; } - uint64_t DumpCapacity() const override { return encoder_.TableCapacity(); } - uint32_t KeySize() const override { return options_.key_size; } - - uint32_t ValueSize() const override { return options_.value_size; } - - DataType ValueType() const override { return options_.value_type; } - - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } - OF_CUDA_CHECK(hipMalloc(&encoding_buffer_, query_length * sizeof(uint64_t))); - max_query_length_ = query_length; - } - - CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kFull; } - - void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) 
override; - - void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) override; - - void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, - uint8_t* mask) override; - - void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override; - void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - const void* update, const float* lr, float scale, uint32_t* n_evicted, - void* evicted_keys, void* evicted_values) override; - void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, void* keys, void* values) override; - - void Clear() override; - - private: - OrdinalEncoder encoder_; - int device_index_; - uint32_t num_elem_per_value_{}; - Elem* values_; - Index* encoding_buffer_{}; - CacheOptions options_; - uint32_t max_query_length_; -}; - -template -void CacheImpl::Test(ep::Stream* stream, uint32_t n_keys, - const void* keys, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) { - OF_CUDA_CHECK( - hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - RUN_CUDA_KERNEL((LookupKernel), stream, values_elem_cnt, - num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), - encoding_buffer_, nullptr, n_missing, static_cast(missing_keys), - missing_indices); -} - -template -void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, - const void* keys, void* values, - uint32_t* n_missing, void* missing_keys, - uint32_t* missing_indices) { - OF_CUDA_CHECK( - hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - constexpr uint32_t block_size = 128; - uint32_t grid_size = (n_keys + block_size - 1) / block_size; - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - EncodeLookupKernel - <<As()->cuda_stream()>>>( - num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), - encoding_buffer_, static_cast(values), n_missing, static_cast(missing_keys), - missing_indices, encoder_.TableCapacity(), encoder_.table_keys(), - encoder_.table_indices()); -} - -template -void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, - const void* keys, void* values, uint8_t* mask) { - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - constexpr uint32_t block_size = 128; - uint32_t grid_size = (n_keys + block_size - 1) / block_size; - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - EncodeLookupMaskKernel - <<As()->cuda_stream()>>>( - num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), - encoding_buffer_, static_cast(values), mask, encoder_.TableCapacity(), - encoder_.table_keys(), encoder_.table_indices()); -} - -template -void CacheImpl::Put(ep::Stream* stream, uint32_t n_keys, - const void* keys, const void* values, - uint32_t* n_evicted, void* evicted_keys, - void* evicted_values) { - OF_CUDA_CHECK( - hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - encoder_.template Encode(stream, n_keys, static_cast(keys), 
encoding_buffer_); - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - RUN_CUDA_KERNEL((UpdateKernel), stream, values_elem_cnt / pack_size, - num_elem_per_value_, values_, values_elem_cnt, encoding_buffer_, - static_cast(values)); -} - -template -void CacheImpl::FusedHalfUpdatePut( - ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, const void* update, - const float* lr, float scale, uint32_t* n_evicted, void* evicted_keys, void* evicted_values) { - if (!std::is_same::value) { UNIMPLEMENTED(); } - OF_CUDA_CHECK( - hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - RUN_CUDA_KERNEL((FusedHalfUpdateKernel), stream, - values_elem_cnt / pack_size, num_elem_per_value_, values_, values_elem_cnt, - encoding_buffer_, static_cast(values), - static_cast(update), lr, scale); -} -template -void CacheImpl::Dump(ep::Stream* stream, uint64_t start_key_index, - uint64_t end_key_index, uint32_t* n_dumped, - void* keys, void* values) { - encoder_.Dump(stream, start_key_index, end_key_index, n_dumped, static_cast(keys), - encoding_buffer_); - RUN_CUDA_KERNEL((DumpValueKernel), stream, - num_elem_per_value_ * (end_key_index - start_key_index), num_elem_per_value_, - n_dumped, encoding_buffer_, values_, static_cast(values)); -} - -template -void CacheImpl::Clear() { - encoder_.Clear(); -} - -template -std::unique_ptr DispatchValueType(const CacheOptions& options) { - if (options.value_type == DataType::kFloat) { - const size_t value_elem_cnt = options.value_size / sizeof(float); - const size_t half_warp = 16; - if (value_elem_cnt % 4 == 0 && value_elem_cnt / 4 > half_warp) { - return std::unique_ptr(new CacheImpl(options)); - } else if (value_elem_cnt % 2 == 0 && value_elem_cnt / 2 > half_warp) { - return std::unique_ptr(new CacheImpl(options)); - } else { - return std::unique_ptr(new CacheImpl(options)); - } - } else if (options.value_size % sizeof(ulonglong2) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else if (options.value_size % sizeof(uint64_t) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else if (options.value_size % sizeof(uint32_t) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else if (options.value_size % sizeof(uint16_t) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else { - return std::unique_ptr(new CacheImpl(options)); - } -} - -template -std::unique_ptr DispatchKeyType(const CacheOptions& options) { - if (options.key_size == sizeof(Key32)) { - return DispatchValueType(options); - } else if (options.key_size == sizeof(Key64)) { - return DispatchValueType(options); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -std::unique_ptr DispatchIndexType(const CacheOptions& options) { - const int64_t table_capacity = static_cast(options.capacity) / options.load_factor; - if (table_capacity >= (1ULL << 31ULL)) { - return DispatchKeyType(options); - } else { - return DispatchKeyType(options); - } -} - -} // namespace - -std::unique_ptr NewFullCache(const CacheOptions& options) { - return DispatchIndexType(options); -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/embedding/full_cache.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/embedding/hash_functions.hip.h" +#include "oneflow/core/hip/atomic.hip.h" + +namespace oneflow { + +namespace embedding { + +using Key32 = unsigned int; +using Key64 = unsigned long long int; +using Key128 = ulonglong2; + +namespace { + +template +__device__ bool TryGetOrInsert(Key* entry_key, volatile Index* entry_index, Index* table_size, + Key key, Index* out) { + Key key_hi = (key | 0x1); + Key key_lo = (key & 0x1); + Index index_plus_one = 0; + Key old_entry_key = cuda::atomic::CAS(entry_key, static_cast(0), key_hi); + while (index_plus_one == 0) { + if (old_entry_key == static_cast(0)) { + Index index = cuda::atomic::Add(table_size, static_cast(1)); + index_plus_one = index + 1; + *entry_index = ((index_plus_one << 1U) | key_lo); + *out = index_plus_one; + return true; + } else if (old_entry_key == key_hi) { + const Index entry_index_val = *entry_index; + if (entry_index_val == 0) { + // do nothing + } else if ((entry_index_val & 0x1) == key_lo) { + *out = (entry_index_val >> 1U); + return true; + } else { + return false; + } + } else { + return false; + } + } + return false; +} + +template +__device__ bool GetOrInsertOne(const size_t capacity, Key* table_keys, Index* table_indices, + Index* table_size, Key key, size_t hash, Index* out) { + const size_t start_idx = hash % capacity; + for (size_t count = 0; count < capacity; ++count) { + const size_t idx = (start_idx + count) % capacity; + Key* entry_key = table_keys + idx; + Index* entry_index = table_indices + idx; + if (TryGetOrInsert(entry_key, entry_index, table_size, key, out)) { return true; } + } + return false; +} + +template +__device__ bool GetOne(const size_t capacity, Key* table_keys, Index* table_indices, Key key, + size_t hash, Index* out) { + const size_t start_idx = hash % capacity; + for (size_t count = 0; count < capacity; ++count) { + const size_t idx = (start_idx + count) % capacity; + Key entry_key = table_keys[idx]; + Key entry_index = table_indices[idx]; + Key key_hi = (key | 0x1); + Key key_lo = (key & 0x1); + if (entry_key == 0) { break; } + if (entry_key == key_hi) { + if ((entry_index & 0x1) == key_lo) { + *out = (entry_index >> 1U); + return true; + } + } + } + *out = 0; + return false; +} + +template +__global__ void OrdinalEncodeKernel(uint64_t capacity, Key* table_keys, Index* table_indices, + Index* table_size, uint32_t num_keys, const Key* keys, + Index* context) { + CUDA_1D_KERNEL_LOOP(i, num_keys) { + Key key = keys[i]; + uint64_t hash = FullCacheHash()(key); + bool success = GetOrInsertOne(capacity, table_keys, table_indices, table_size, key, + hash, context + i); + assert(success); + } +} + +template +__global__ void OrdinalEncodeLookupKernel(uint64_t capacity, Key* table_keys, Index* table_indices, + uint32_t num_keys, const Key* keys, Index* context) { + CUDA_1D_KERNEL_LOOP(i, num_keys) { + Key key = keys[i]; + uint64_t hash = FullCacheHash()(key); + GetOne(capacity, table_keys, table_indices, key, hash, context + i); + } +} + +template +__global__ void 
OrdinalEncodeDumpKernel(const Key* table_keys, const Index* table_indices, + uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, Key* keys, Index* context) { + CUDA_1D_KERNEL_LOOP(i, (end_key_index - start_key_index)) { + Key entry_key = table_keys[i + start_key_index]; + Index entry_index = table_indices[i + start_key_index]; + if (entry_index != 0) { + uint32_t index = cuda::atomic::Add(n_dumped, static_cast(1)); + keys[index] = ((entry_key ^ 0x1) | (entry_index & 0x1)); + context[index] = (entry_index >> 1U); + } + } +} + +template +__global__ void LookupKernel(uint32_t value_length, const Elem* cache_values, + uint32_t values_elem_cnt, const Key* keys, const Index* context, + Elem* values, uint32_t* n_missing, Key* missing_keys, + uint32_t* missing_indices) { + CUDA_1D_KERNEL_LOOP(i, values_elem_cnt) { + const uint64_t key_id = i / value_length; + const uint64_t ctx = context[key_id]; + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * value_length; + if (ctx == 0) { + const Key missing_key = keys[key_id]; + if (col_id == 0) { + const uint32_t old_n_missing = cuda::atomic::Add(n_missing, static_cast(1)); + missing_keys[old_n_missing] = missing_key; + missing_indices[old_n_missing] = key_id; + } + continue; + } + if (return_value) { values[i] = cache_values[row_id * value_length + col_id]; } + } +} + +template +__global__ void EncodeLookupKernel(uint32_t value_length, const Elem* cache_values, + uint32_t values_elem_cnt, const Key* keys, const Index* context, + Elem* values, uint32_t* n_missing, Key* missing_keys, + uint32_t* missing_indices, const size_t capacity, + Key* table_keys, Index* table_indices) { + constexpr uint32_t warp_size = 32; + constexpr uint32_t n_warp_per_block = block_size / warp_size; + const uint32_t warp_id = threadIdx.x / warp_size; + const uint32_t lane_id = threadIdx.x % warp_size; + const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; + const uint32_t global_n_warp = gridDim.x * n_warp_per_block; + const uint32_t n_keys = values_elem_cnt / value_length; + __shared__ Key batch_keys[n_warp_per_block][warp_size]; + __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; + __shared__ Key batch_missing_keys[n_warp_per_block][warp_size]; + __shared__ uint32_t batch_missing_indices[n_warp_per_block][warp_size]; + __shared__ uint32_t batch_n_missing[n_warp_per_block]; + for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; + batch_start += global_n_warp * warp_size) { + const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); + if (lane_id == 0) { batch_n_missing[warp_id] = 0; } + __syncthreads(); + const uint32_t key_offset = batch_start + lane_id; + if (key_offset < n_keys) { + const Key key = keys[batch_start + lane_id]; + const uint64_t hash = FullCacheHash()(key); + Index row; + GetOne(capacity, table_keys, table_indices, key, hash, &row); + batch_row_ids[warp_id][lane_id] = row; + if (row == 0) { + const uint32_t batch_missing_idx = atomicAdd(batch_n_missing + warp_id, 1); + batch_missing_keys[warp_id][batch_missing_idx] = key; + batch_missing_indices[warp_id][batch_missing_idx] = key_offset; + } + } + __syncthreads(); + const uint32_t batch_n_missing_t = batch_n_missing[warp_id]; + if (lane_id == 0) { + const uint32_t old_n_missing = + cuda::atomic::Add(n_missing, static_cast(batch_n_missing_t)); + batch_n_missing[warp_id] = old_n_missing; + } + __syncthreads(); + if (lane_id < batch_n_missing_t) { + missing_keys[batch_n_missing[warp_id] + lane_id] = 
batch_missing_keys[warp_id][lane_id]; + missing_indices[batch_n_missing[warp_id] + lane_id] = batch_missing_indices[warp_id][lane_id]; + } + for (int i = 0; i < batch_n_key; ++i) { + const Key key = batch_keys[warp_id][i]; + const Index row = batch_row_ids[warp_id][i]; + if (row == 0) { continue; } + for (int col = lane_id; col < value_length; col += warp_size) { + values[(batch_start + i) * value_length + col] = + cache_values[(row - 1) * value_length + col]; + } + } + __syncthreads(); + } +} + +template +struct alignas(sizeof(T) * pack_size) Pack { + T elem[pack_size]; +}; + +template +__global__ void EncodeLookupMaskKernel(uint32_t value_length, const Elem* __restrict__ cache_values, + uint32_t values_elem_cnt, const Key* __restrict__ keys, + const Index* __restrict__ context, Elem* __restrict__ values, + uint8_t* __restrict__ mask, const size_t capacity, + Key* __restrict__ table_keys, + Index* __restrict__ table_indices) { + const uint32_t packed_cols = value_length / pack_size; + auto* packed_values = reinterpret_cast*>(values); + const auto* packed_cache_values = reinterpret_cast*>(cache_values); + constexpr uint32_t warp_size = 32; + constexpr uint32_t n_warp_per_block = block_size / warp_size; + const uint32_t warp_id = threadIdx.x / warp_size; + const uint32_t lane_id = threadIdx.x % warp_size; + const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; + const uint32_t global_n_warp = gridDim.x * n_warp_per_block; + const uint32_t n_keys = values_elem_cnt / value_length; + __shared__ Key batch_keys[n_warp_per_block][warp_size]; + __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; + for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; + batch_start += global_n_warp * warp_size) { + const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); + const uint32_t key_offset = batch_start + lane_id; + if (key_offset < n_keys) { + const Key key = keys[batch_start + lane_id]; + const uint64_t hash = FullCacheHash()(key); + Index row; + GetOne(capacity, table_keys, table_indices, key, hash, &row); + batch_row_ids[warp_id][lane_id] = row; + mask[key_offset] = row > 0; + } + __syncthreads(); + for (int i = 0; i < batch_n_key; ++i) { + const Key key = batch_keys[warp_id][i]; + const Index row = batch_row_ids[warp_id][i]; + if (row == 0) { continue; } +#pragma unroll 4 + for (int col = lane_id; col < packed_cols; col += warp_size) { + packed_values[(batch_start + i) * packed_cols + col] = + packed_cache_values[(row - 1) * packed_cols + col]; + } + } + __syncthreads(); + } +} + +template +__global__ void UpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, + const Index* context, const Elem* values) { + const int packed_values_elem_cnt = values_elem_cnt / pack_size; + const uint32_t packed_elem_cnt = value_length / pack_size; + auto* packed_cache_values = reinterpret_cast*>(cache_values); + auto* packed_values = reinterpret_cast*>(values); + CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { + const uint64_t key_id = i / packed_elem_cnt; + const uint64_t ctx = context[key_id]; + if (ctx == 0) { continue; } + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * packed_elem_cnt; + packed_cache_values[row_id * packed_elem_cnt + col_id] = packed_values[i]; + } +} + +template +__global__ typename std::enable_if::value, void>::type +FusedHalfUpdateKernel(uint32_t value_length, Elem* __restrict__ cache_values, + uint32_t values_elem_cnt, const Index* __restrict__ context, + const Elem* __restrict__ 
values, const half* __restrict__ update, + const float* __restrict__ lr, float scale) { + const int packed_values_elem_cnt = values_elem_cnt / pack_size; + const uint32_t packed_elem_cnt = value_length / pack_size; + auto* packed_cache_values = reinterpret_cast*>(cache_values); + auto* packed_values = reinterpret_cast*>(values); + auto* packed_update = reinterpret_cast*>(update); + const float alpha = -*lr * scale; + CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { + const uint64_t key_id = i / packed_elem_cnt; + const uint64_t ctx = context[key_id]; + if (ctx == 0) { continue; } + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * packed_elem_cnt; + Pack m = packed_values[i]; + Pack u = packed_update[i]; + for (size_t j = 0; j < pack_size; ++j) { m.elem[j] += static_cast(u.elem[j]) * alpha; } + packed_cache_values[row_id * packed_elem_cnt + col_id] = m; + } +} + +template +__global__ typename std::enable_if::value, void>::type +FusedHalfUpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, + const Index* context, const Elem* values, const half* update, const float* lr, + float scale) { + asm volatile("s_trap 0;"); +} + +template +__global__ void DumpValueKernel(uint32_t value_length, const uint32_t* n_dumped, + const Index* context, const Elem* cache_values, Elem* values) { + CUDA_1D_KERNEL_LOOP(i, *n_dumped * value_length) { + const uint64_t key_id = i / value_length; + const uint64_t ctx = context[key_id]; + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * value_length; + values[i] = cache_values[row_id * value_length + col_id]; + } +} + +template +class OrdinalEncoder { + public: + OF_DISALLOW_COPY_AND_MOVE(OrdinalEncoder); + explicit OrdinalEncoder(uint64_t capacity, float load_factor) + : capacity_(capacity), table_capacity_(capacity / load_factor) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + OF_CUDA_CHECK(hipMalloc(&table_size_, sizeof(Index))); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&table_size_host_), sizeof(Index))); + OF_CUDA_CHECK(hipMalloc(&table_keys_, table_capacity_ * sizeof(Key))); + OF_CUDA_CHECK(hipMalloc(&table_indices_, table_capacity_ * sizeof(Index))); + Clear(); + } + ~OrdinalEncoder() { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipFree(table_size_)); + OF_CUDA_CHECK(hipHostFree(table_size_host_)); + OF_CUDA_CHECK(hipFree(table_keys_)); + OF_CUDA_CHECK(hipFree(table_indices_)); + } + + template + void Encode(ep::Stream* stream, uint32_t num_keys, const Key* keys, Index* context) { + if (insert) { + RUN_CUDA_KERNEL((OrdinalEncodeKernel), stream, num_keys, table_capacity_, + table_keys_, table_indices_, table_size_, num_keys, keys, context); + OF_CUDA_CHECK(hipMemcpyAsync(table_size_host_, table_size_, sizeof(Index), hipMemcpyDefault, + stream->As()->cuda_stream())); + CHECK_JUST(stream->Sync()); + CHECK_LT(*table_size_host_, capacity_) + << "The number of key is larger than cache size, please enlarge cache_memory_budget. 
"; + } else { + RUN_CUDA_KERNEL((OrdinalEncodeLookupKernel), stream, num_keys, table_capacity_, + table_keys_, table_indices_, num_keys, keys, context); + } + } + + void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, Key* keys, Index* context) { + OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + RUN_CUDA_KERNEL((OrdinalEncodeDumpKernel), stream, end_key_index - start_key_index, + table_keys_, table_indices_, start_key_index, end_key_index, n_dumped, keys, + context); + } + + void Clear() { + OF_CUDA_CHECK(hipMemset(table_size_, 0, sizeof(Index))); + OF_CUDA_CHECK(hipMemset(table_keys_, 0, table_capacity_ * sizeof(Key))); + OF_CUDA_CHECK(hipMemset(table_indices_, 0, table_capacity_ * sizeof(Index))); + } + + uint64_t TableCapacity() const { return table_capacity_; } + + Key* table_keys() const { return table_keys_; } + + Index* table_indices() const { return table_indices_; } + + private: + int device_index_{}; + Key* table_keys_; + Index* table_indices_; + uint64_t capacity_; + uint64_t table_capacity_; + Index* table_size_{}; + Index* table_size_host_{}; +}; + +template +class CacheImpl : public Cache { + public: + OF_DISALLOW_COPY_AND_MOVE(CacheImpl); + explicit CacheImpl(const CacheOptions& options) + : encoder_(options.capacity, options.load_factor), + device_index_(-1), + options_(options), + max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + const uint64_t values_size = options.capacity * options.value_size; + if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipMalloc(&values_, values_size)); + } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { + if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&values_), values_size)); + } else { + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&values_), + values_size)); + } + } else { + UNIMPLEMENTED(); + } + num_elem_per_value_ = options_.value_size / sizeof(Elem); + } + ~CacheImpl() { + CudaCurrentDeviceGuard guard(device_index_); + if (options_.value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipFree(values_)); + } else if (options_.value_memory_kind == CacheOptions::MemoryKind::kHost) { + OF_CUDA_CHECK(hipHostFree(values_)); + } else { + UNIMPLEMENTED(); + } + if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } + } + + uint64_t Capacity() const override { return options_.capacity; } + uint64_t DumpCapacity() const override { return encoder_.TableCapacity(); } + uint32_t KeySize() const override { return options_.key_size; } + + uint32_t ValueSize() const override { return options_.value_size; } + + DataType ValueType() const override { return options_.value_type; } + + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { return; } + if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } + OF_CUDA_CHECK(hipMalloc(&encoding_buffer_, query_length * sizeof(uint64_t))); + max_query_length_ = query_length; + } + + CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kFull; } + + void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) 
override; + + void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) override; + + void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, + uint8_t* mask) override; + + void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override; + void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + const void* update, const float* lr, float scale, uint32_t* n_evicted, + void* evicted_keys, void* evicted_values) override; + void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, void* keys, void* values) override; + + void Clear() override; + + private: + OrdinalEncoder encoder_; + int device_index_; + uint32_t num_elem_per_value_{}; + Elem* values_; + Index* encoding_buffer_{}; + CacheOptions options_; + uint32_t max_query_length_; +}; + +template +void CacheImpl::Test(ep::Stream* stream, uint32_t n_keys, + const void* keys, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) { + OF_CUDA_CHECK( + hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + RUN_CUDA_KERNEL((LookupKernel), stream, values_elem_cnt, + num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), + encoding_buffer_, nullptr, n_missing, static_cast(missing_keys), + missing_indices); +} + +template +void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, + const void* keys, void* values, + uint32_t* n_missing, void* missing_keys, + uint32_t* missing_indices) { + OF_CUDA_CHECK( + hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + constexpr uint32_t block_size = 128; + uint32_t grid_size = (n_keys + block_size - 1) / block_size; + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + EncodeLookupKernel + <<As()->cuda_stream()>>>( + num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), + encoding_buffer_, static_cast(values), n_missing, static_cast(missing_keys), + missing_indices, encoder_.TableCapacity(), encoder_.table_keys(), + encoder_.table_indices()); +} + +template +void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, + const void* keys, void* values, uint8_t* mask) { + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + constexpr uint32_t block_size = 128; + uint32_t grid_size = (n_keys + block_size - 1) / block_size; + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + EncodeLookupMaskKernel + <<As()->cuda_stream()>>>( + num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), + encoding_buffer_, static_cast(values), mask, encoder_.TableCapacity(), + encoder_.table_keys(), encoder_.table_indices()); +} + +template +void CacheImpl::Put(ep::Stream* stream, uint32_t n_keys, + const void* keys, const void* values, + uint32_t* n_evicted, void* evicted_keys, + void* evicted_values) { + OF_CUDA_CHECK( + hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + encoder_.template Encode(stream, n_keys, static_cast(keys), 
encoding_buffer_); + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + RUN_CUDA_KERNEL((UpdateKernel), stream, values_elem_cnt / pack_size, + num_elem_per_value_, values_, values_elem_cnt, encoding_buffer_, + static_cast(values)); +} + +template +void CacheImpl::FusedHalfUpdatePut( + ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, const void* update, + const float* lr, float scale, uint32_t* n_evicted, void* evicted_keys, void* evicted_values) { + if (!std::is_same::value) { UNIMPLEMENTED(); } + OF_CUDA_CHECK( + hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + RUN_CUDA_KERNEL((FusedHalfUpdateKernel), stream, + values_elem_cnt / pack_size, num_elem_per_value_, values_, values_elem_cnt, + encoding_buffer_, static_cast(values), + static_cast(update), lr, scale); +} +template +void CacheImpl::Dump(ep::Stream* stream, uint64_t start_key_index, + uint64_t end_key_index, uint32_t* n_dumped, + void* keys, void* values) { + encoder_.Dump(stream, start_key_index, end_key_index, n_dumped, static_cast(keys), + encoding_buffer_); + RUN_CUDA_KERNEL((DumpValueKernel), stream, + num_elem_per_value_ * (end_key_index - start_key_index), num_elem_per_value_, + n_dumped, encoding_buffer_, values_, static_cast(values)); +} + +template +void CacheImpl::Clear() { + encoder_.Clear(); +} + +template +std::unique_ptr DispatchValueType(const CacheOptions& options) { + if (options.value_type == DataType::kFloat) { + const size_t value_elem_cnt = options.value_size / sizeof(float); + const size_t half_warp = 16; + if (value_elem_cnt % 4 == 0 && value_elem_cnt / 4 > half_warp) { + return std::unique_ptr(new CacheImpl(options)); + } else if (value_elem_cnt % 2 == 0 && value_elem_cnt / 2 > half_warp) { + return std::unique_ptr(new CacheImpl(options)); + } else { + return std::unique_ptr(new CacheImpl(options)); + } + } else if (options.value_size % sizeof(ulonglong2) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else if (options.value_size % sizeof(uint64_t) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else if (options.value_size % sizeof(uint32_t) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else if (options.value_size % sizeof(uint16_t) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else { + return std::unique_ptr(new CacheImpl(options)); + } +} + +template +std::unique_ptr DispatchKeyType(const CacheOptions& options) { + if (options.key_size == sizeof(Key32)) { + return DispatchValueType(options); + } else if (options.key_size == sizeof(Key64)) { + return DispatchValueType(options); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +std::unique_ptr DispatchIndexType(const CacheOptions& options) { + const int64_t table_capacity = static_cast(options.capacity) / options.load_factor; + if (table_capacity >= (1ULL << 31ULL)) { + return DispatchKeyType(options); + } else { + return DispatchKeyType(options); + } +} + +} // namespace + +std::unique_ptr NewFullCache(const CacheOptions& options) { + return DispatchIndexType(options); +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/hash_functions.hip.h b/oneflow/core/embedding/hash_functions.hip.h index 25c6eb0..99a2373 100644 --- 
a/oneflow/core/embedding/hash_functions.hip.h +++ b/oneflow/core/embedding/hash_functions.hip.h @@ -1,100 +1,100 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ -#define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ - -#include -#include "oneflow/core/common/data_type.h" - -namespace oneflow { - -namespace embedding { - -namespace { - -// From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h -static const uint64_t PRIME64_1 = - 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 -static const uint64_t PRIME64_2 = - 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 -static const uint64_t PRIME64_3 = - 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 -static const uint64_t PRIME64_4 = - 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 -static const uint64_t PRIME64_5 = - 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 - -#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) - -OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) { - acc += input * PRIME64_2; - acc = XXH_rotl64(acc, 31); - acc *= PRIME64_1; - return acc; -} - -OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { - uint64_t acc = seed + PRIME64_5; - acc += sizeof(uint64_t); - acc = acc ^ XXH64_round(0, v); - acc = XXH_rotl64(acc, 27) * PRIME64_1; - acc = acc + PRIME64_4; - acc ^= (acc >> 33); - acc = acc * PRIME64_2; - acc = acc ^ (acc >> 29); - acc = acc * PRIME64_3; - acc = acc ^ (acc >> 32); - return acc; -} - -static const size_t kShardingHashSeed = 1; -static const size_t kLocalUniqueHashSeed = 2; -static const size_t kGlobalUniqueHashSeed = 3; -static const size_t kFullCacheHashSeed = 4; -static const size_t kLruCacheHashSeed = 5; - -} // namespace - -struct ShardingHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); } - OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); } - OF_DEVICE_FUNC size_t operator()(int32_t v) { - return xxh64_uint64(static_cast(v), kShardingHashSeed); - } - OF_DEVICE_FUNC size_t operator()(int64_t v) { - return xxh64_uint64(static_cast(v), kShardingHashSeed); - } -}; - -struct LocalUniqueHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); } -}; - -struct GlobalUniqueHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); } -}; - -struct FullCacheHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); } -}; - -struct LruCacheHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); } -}; - -} // namespace embedding -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ +#define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ + +#include +#include "oneflow/core/common/data_type.h" + +namespace oneflow { + +namespace embedding { + +namespace { + +// From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h +static const uint64_t PRIME64_1 = + 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 +static const uint64_t PRIME64_2 = + 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 +static const uint64_t PRIME64_3 = + 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 +static const uint64_t PRIME64_4 = + 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 +static const uint64_t PRIME64_5 = + 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 + +#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) + +OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) { + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { + uint64_t acc = seed + PRIME64_5; + acc += sizeof(uint64_t); + acc = acc ^ XXH64_round(0, v); + acc = XXH_rotl64(acc, 27) * PRIME64_1; + acc = acc + PRIME64_4; + acc ^= (acc >> 33); + acc = acc * PRIME64_2; + acc = acc ^ (acc >> 29); + acc = acc * PRIME64_3; + acc = acc ^ (acc >> 32); + return acc; +} + +static const size_t kShardingHashSeed = 1; +static const size_t kLocalUniqueHashSeed = 2; +static const size_t kGlobalUniqueHashSeed = 3; +static const size_t kFullCacheHashSeed = 4; +static const size_t kLruCacheHashSeed = 5; + +} // namespace + +struct ShardingHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); } + OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); } + OF_DEVICE_FUNC size_t operator()(int32_t v) { + return xxh64_uint64(static_cast(v), kShardingHashSeed); + } + OF_DEVICE_FUNC size_t operator()(int64_t v) { + return xxh64_uint64(static_cast(v), kShardingHashSeed); + } +}; + +struct LocalUniqueHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); } +}; + +struct GlobalUniqueHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); } +}; + +struct FullCacheHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); } +}; + +struct LruCacheHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); } +}; + +} // namespace embedding +} // namespace oneflow #endif // ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ \ No newline at end of file diff --git a/oneflow/core/embedding/lru_cache.hip.cpp b/oneflow/core/embedding/lru_cache.hip.cpp index 8db00c5..cfb1044 100644 
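
Editor's note, not part of the patch: a minimal single-threaded host sketch of the ordinal-encoding scheme that the full_cache kernels above implement with atomics (TryGetOrInsert / GetOrInsertOne / GetOne). It illustrates why the kernels store `key | 0x1` in the key slot (so 0 can mark an empty entry), stash the key's real low bit in bit 0 of the stored 1-based index, probe linearly from the hash, and return 0 for "missing" so callers can use `row_id = ctx - 1`. The names `OrdinalTable` and `Mix64` are invented for illustration; the hash follows the xxh64_uint64 steps and the kFullCacheHashSeed value (4) shown in the hash_functions.hip.h hunk. The device code does the same thing concurrently with atomicCAS/atomicAdd; this sketch only models the slot layout and probing.

#include <cstdint>
#include <iostream>
#include <vector>

// xxHash64-style mix of a single 64-bit key (same steps as xxh64_uint64 above).
static uint64_t Mix64(uint64_t v, uint64_t seed) {
  constexpr uint64_t P1 = 0x9E3779B185EBCA87ULL, P2 = 0xC2B2AE3D27D4EB4FULL,
                     P3 = 0x165667B19E3779F9ULL, P4 = 0x85EBCA77C2B2AE63ULL,
                     P5 = 0x27D4EB2F165667C5ULL;
  auto rotl = [](uint64_t x, int r) { return (x << r) | (x >> (64 - r)); };
  uint64_t acc = seed + P5 + sizeof(uint64_t);
  acc ^= rotl(v * P2, 31) * P1;  // XXH64_round(0, v)
  acc = rotl(acc, 27) * P1 + P4;
  acc ^= acc >> 33; acc *= P2;
  acc ^= acc >> 29; acc *= P3;
  acc ^= acc >> 32;
  return acc;
}

// Single-threaded model of the ordinal encoder: maps each distinct key to a
// dense 1-based index; 0 means "not present".
class OrdinalTable {
 public:
  explicit OrdinalTable(size_t capacity)
      : keys_(capacity, 0), indices_(capacity, 0), size_(0) {}

  // Insert-or-lookup, mirroring TryGetOrInsert / GetOrInsertOne.
  uint64_t GetOrInsert(uint64_t key) {
    const uint64_t key_hi = key | 0x1;  // stored key, never 0
    const uint64_t key_lo = key & 0x1;  // real low bit, kept in the index slot
    const size_t start = Mix64(key, /*kFullCacheHashSeed=*/4) % keys_.size();
    for (size_t probe = 0; probe < keys_.size(); ++probe) {
      const size_t slot = (start + probe) % keys_.size();
      if (keys_[slot] == 0) {  // empty slot: claim it (a CAS on the device)
        keys_[slot] = key_hi;
        indices_[slot] = (++size_ << 1) | key_lo;
        return indices_[slot] >> 1;
      }
      if (keys_[slot] == key_hi && (indices_[slot] & 0x1) == key_lo) {
        return indices_[slot] >> 1;  // already present
      }
      // different key (or same key_hi with the other low bit): keep probing
    }
    return 0;  // table full
  }

  // Lookup-only, mirroring GetOne: returns 0 if the key was never inserted.
  uint64_t Get(uint64_t key) const {
    const uint64_t key_hi = key | 0x1;
    const uint64_t key_lo = key & 0x1;
    const size_t start = Mix64(key, 4) % keys_.size();
    for (size_t probe = 0; probe < keys_.size(); ++probe) {
      const size_t slot = (start + probe) % keys_.size();
      if (keys_[slot] == 0) { break; }  // hit an empty slot: key is absent
      if (keys_[slot] == key_hi && (indices_[slot] & 0x1) == key_lo) {
        return indices_[slot] >> 1;
      }
    }
    return 0;
  }

 private:
  std::vector<uint64_t> keys_;
  std::vector<uint64_t> indices_;
  uint64_t size_;
};

int main() {
  OrdinalTable table(16);
  for (uint64_t k : {42u, 7u, 42u, 8u}) {
    std::cout << "key " << k << " -> index " << table.GetOrInsert(k) << "\n";
  }
  std::cout << "lookup 7 -> " << table.Get(7)
            << ", lookup 9 -> " << table.Get(9) << "\n";
  return 0;  // expected: 42->1, 7->2, 42->1, 8->3; lookup 7->2, lookup 9->0
}

The 1-based index is the same quantity the cache kernels call `context`: a value of 0 propagates as "missing", and every consumer subtracts one to get the cache row, which is why the dump and lookup kernels above test `ctx == 0` before touching `cache_values`.
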
--- a/oneflow/core/embedding/lru_cache.hip.cpp +++ b/oneflow/core/embedding/lru_cache.hip.cpp @@ -1,585 +1,585 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Inspired by https://github.com/NVIDIA-Merlin/HugeCTR/blob/master/gpu_cache/src/nv_gpu_cache.cu - -#include "oneflow/core/embedding/lru_cache.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/embedding/hash_functions.hip.h" -#include -#include - -namespace oneflow { - -namespace embedding { - -namespace { - -constexpr int kWarpSize = 64; -constexpr int kNumWarpPerBlock = 2; -constexpr int kBlockSize = kNumWarpPerBlock * kWarpSize; -constexpr unsigned long long int kFullMask = 0xFFFFFFFFFFFFFFFFU; - -ep::CudaLaunchConfig GetLaunchConfig(uint32_t n_keys) { - return ep::CudaLaunchConfig((n_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, - kWarpSize * kNumWarpPerBlock, 0); -} - -struct ThreadContext { - __device__ ThreadContext() { - const uint32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - global_warp_id = global_thread_id / kWarpSize; - warp_id_in_block = global_warp_id % kNumWarpPerBlock; // NOLINT - num_warps = gridDim.x * kNumWarpPerBlock; // NOLINT - lane_id = global_thread_id % kWarpSize; - } - - uint32_t global_warp_id; - uint32_t warp_id_in_block; - uint32_t num_warps; - uint32_t lane_id; -}; - -class WarpMutexAtomicImpl { - public: - OF_DISALLOW_COPY_AND_MOVE(WarpMutexAtomicImpl); - __device__ WarpMutexAtomicImpl() : flag_(0) {} - __device__ ~WarpMutexAtomicImpl() = default; - - __device__ void Lock(const ThreadContext& thread_ctx) { - if (thread_ctx.lane_id == 0) { - while (atomicCAS(&flag_, 0, 1) != 0) - ; - } - __threadfence(); - __syncthreads(); - } - - __device__ void Unlock(const ThreadContext& thread_ctx) { - __syncthreads(); - __threadfence(); - if (thread_ctx.lane_id == 0) { atomicExch(&flag_, 0); } - } - - private: - int32_t flag_; -}; - -template -struct LruCacheContext { - Key* keys; - Elem* lines; - uint8_t* ages; - void* mutex; - uint64_t n_set; - uint32_t line_size; - CacheOptions::MemoryKind value_memory_kind; -}; - -__global__ void InitCacheSetMutex(uint32_t n_set, void* mutex) { - - using WarpMutex = WarpMutexAtomicImpl; - - const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n_set) { new (reinterpret_cast(mutex) + idx) WarpMutex; } -} - -template -void ClearLruCacheContext(LruCacheContext* ctx) { - OF_CUDA_CHECK(hipMemset(ctx->keys, 0, ctx->n_set * kWarpSize * sizeof(Key))); - OF_CUDA_CHECK(hipMemset(ctx->ages, 0, ctx->n_set * kWarpSize * sizeof(uint8_t))); - InitCacheSetMutex<<<(ctx->n_set - 1 + 256) / 256, 256>>>(ctx->n_set, ctx->mutex); -} - -template -void InitLruCacheContext(const CacheOptions& options, LruCacheContext* ctx) { - const size_t keys_size_per_set = kWarpSize * sizeof(Key); - const uint32_t line_size = options.value_size / sizeof(Elem); - const size_t lines_size_per_set = kWarpSize * line_size * sizeof(Elem); - const size_t ages_size_per_set = kWarpSize * sizeof(uint8_t); - int device 
= 0; - OF_CUDA_CHECK(hipGetDevice(&device)); - int major = 0; - OF_CUDA_CHECK(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, device)); - size_t mutex_size_per_set = 0; - - mutex_size_per_set = sizeof(WarpMutexAtomicImpl); - - const size_t n_set = (options.capacity - 1 + kWarpSize) / kWarpSize; - CHECK_GT(n_set, 0); - ctx->n_set = n_set; - ctx->line_size = line_size; - const size_t keys_size = n_set * keys_size_per_set; - OF_CUDA_CHECK(hipMalloc(&(ctx->keys), keys_size)); - const size_t lines_size = n_set * lines_size_per_set; - if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipMalloc(&(ctx->lines), lines_size)); - } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { - if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&(ctx->lines)), lines_size)); - } else { - OF_CUDA_CHECK( - NumaAwareCudaMallocHost(device, reinterpret_cast(&ctx->lines), lines_size)); - } - } else { - UNIMPLEMENTED(); - } - ctx->value_memory_kind = options.value_memory_kind; - const size_t ages_size = n_set * ages_size_per_set; - OF_CUDA_CHECK(hipMalloc(&(ctx->ages), ages_size)); - const size_t mutex_size = n_set * mutex_size_per_set; - OF_CUDA_CHECK(hipMalloc(&(ctx->mutex), mutex_size)); - - ClearLruCacheContext(ctx); -} - -template -void DestroyLruCacheContext(LruCacheContext* ctx) { - OF_CUDA_CHECK(hipFree(ctx->keys)); - if (ctx->value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipFree(ctx->lines)); - } else if (ctx->value_memory_kind == CacheOptions::MemoryKind::kHost) { - OF_CUDA_CHECK(hipHostFree(ctx->lines)); - } else { - UNIMPLEMENTED(); - } - OF_CUDA_CHECK(hipFree(ctx->ages)); - OF_CUDA_CHECK(hipFree(ctx->mutex)); -} - -template -struct SetContext { - - using WarpMutex = WarpMutexAtomicImpl; - - __device__ SetContext(const LruCacheContext& ctx, uint32_t set_id) - : keys(ctx.keys + set_id * kWarpSize), - mutex(reinterpret_cast(ctx.mutex) + set_id), - ages(ctx.ages + set_id * kWarpSize), - lines(ctx.lines + set_id * kWarpSize * ctx.line_size) {} - - __device__ int Lookup(const ThreadContext& thread_ctx, Key key) { - const Key lane_key = keys[thread_ctx.lane_id]; - const int lane_age = ages[thread_ctx.lane_id]; - const bool lane_hit = (lane_key == key && lane_age != 0); - const unsigned long long int hit_mask = __ballot(lane_hit); - if (hit_mask != 0) { - return __ffs(static_cast(hit_mask)) - 1; - } else { - return -1; - } - } - - __device__ void Read(const LruCacheContext& cache_ctx, const ThreadContext& thread_ctx, - int way, Elem* line) { - const Elem* from_line = lines + way * cache_ctx.line_size; - for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { - line[i] = from_line[i]; - } - } - - __device__ int InsertWithoutEvicting(const LruCacheContext& cache_ctx, - const ThreadContext& thread_ctx, Key key) { - int insert_way = -1; - const Key lane_key = keys[thread_ctx.lane_id]; - int lane_age = ages[thread_ctx.lane_id]; - const unsigned long long int hit_mask = __ballot(lane_key == key && lane_age != 0); - if (hit_mask != 0) { - insert_way = __ffs(static_cast(hit_mask)) - 1; - const int insert_way_age = __shfl(lane_age, insert_way); - if (lane_age > insert_way_age) { - lane_age -= 1; - } else if (thread_ctx.lane_id == insert_way) { - lane_age = kWarpSize; - } - __syncthreads(); - } - if (insert_way == -1) { - const unsigned long long int valid_mask = __ballot(lane_age != 0); - if (valid_mask != 
kFullMask) { - insert_way = __popc(static_cast(valid_mask)); - if (lane_age > 0) { - lane_age -= 1; - } else if (thread_ctx.lane_id == insert_way) { - lane_age = kWarpSize; - keys[insert_way] = key; - } - __syncthreads(); - } - } - if (insert_way != -1) { ages[thread_ctx.lane_id] = lane_age; } - return insert_way; - } - - __device__ void Evict(const LruCacheContext& cache_ctx, - const ThreadContext& thread_ctx, Key key, int* way, Key* evicted_key) { - const Key lane_key = keys[thread_ctx.lane_id]; - int lane_age = ages[thread_ctx.lane_id]; - const int insert_way = __ffs(static_cast(__ballot(lane_age == 1))) - 1; - *evicted_key = __shfl(lane_key, insert_way); - if (thread_ctx.lane_id == insert_way) { - keys[insert_way] = key; - lane_age = kWarpSize; - } else if (lane_age > 1) { - lane_age -= 1; - } - __syncthreads(); - ages[thread_ctx.lane_id] = lane_age; - *way = insert_way; - } - - __device__ void Write(const LruCacheContext& cache_ctx, - const ThreadContext& thread_ctx, int way, const Elem* line) { - Elem* to_line = lines + way * cache_ctx.line_size; - for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { - to_line[i] = line[i]; - } - } - - __device__ void Lock(const ThreadContext& thread_ctx) { mutex->Lock(thread_ctx); } - - __device__ void Unlock(const ThreadContext& thread_ctx) { mutex->Unlock(thread_ctx); } - - Key* keys; - Elem* lines; - uint8_t* ages; - WarpMutex* mutex; -}; - -template -__global__ void GetKernel(LruCacheContext cache_ctx, uint32_t num_keys, const Key* keys, - Elem* values, uint32_t* n_missing_keys, Key* missing_keys, - uint32_t* missing_indices) { - ThreadContext thread_ctx{}; - __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; - for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; - batch_offset += thread_ctx.num_warps * kWarpSize) { - const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); - if (thread_ctx.lane_id < n_batch_keys) { - const Key key = keys[batch_offset + thread_ctx.lane_id]; - const size_t hash = LruCacheHash()(key); - const uint32_t set_id = hash % cache_ctx.n_set; - block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; - block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; - } - __syncthreads(); - uint32_t n_warp_missing = 0; - Key warp_missing_key = 0; - uint32_t warp_missing_index = 0; - for (uint32_t i = 0; i < n_batch_keys; ++i) { - const uint32_t key_idx = batch_offset + i; - const Key key = block_keys[thread_ctx.warp_id_in_block][i]; - const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; - SetContext set_ctx(cache_ctx, set_id); - const int way = set_ctx.Lookup(thread_ctx, key); - if (way < 0) { - if (thread_ctx.lane_id == n_warp_missing) { - warp_missing_key = key; - warp_missing_index = key_idx; - } - __syncthreads(); - n_warp_missing += 1; - } else if (!test_only) { - set_ctx.Read(cache_ctx, thread_ctx, way, values + key_idx * cache_ctx.line_size); - } - } - if (n_warp_missing > 0) { - uint32_t base_missing_idx = 0; - if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing_keys, n_warp_missing); } - __syncthreads(); - base_missing_idx = __shfl(base_missing_idx, 0); - if (thread_ctx.lane_id < n_warp_missing) { - missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; - missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; - } - __syncthreads(); - } - __syncthreads(); - } -} - -template 
-__global__ void PutWithoutEvictingKernel(LruCacheContext cache_ctx, uint32_t num_keys, - const Key* keys, const Elem* values, uint32_t* n_missing, - Key* missing_keys, uint32_t* missing_indices) { - ThreadContext thread_ctx{}; - __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; - for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; - batch_offset += thread_ctx.num_warps * kWarpSize) { - const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); - if (thread_ctx.lane_id < n_batch_keys) { - const Key key = keys[batch_offset + thread_ctx.lane_id]; - const size_t hash = LruCacheHash()(key); - const uint32_t set_id = hash % cache_ctx.n_set; - block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; - block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; - } - __syncthreads(); - uint32_t n_warp_missing = 0; - Key warp_missing_key = 0; - uint32_t warp_missing_index = 0; - for (uint32_t i = 0; i < n_batch_keys; ++i) { - const uint32_t key_idx = batch_offset + i; - const Key key = block_keys[thread_ctx.warp_id_in_block][i]; - const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; - SetContext set_ctx(cache_ctx, set_id); - set_ctx.Lock(thread_ctx); - Key evicted_key = 0; - const int insert_way = set_ctx.InsertWithoutEvicting(cache_ctx, thread_ctx, key); - if (insert_way >= 0) { - set_ctx.Write(cache_ctx, thread_ctx, insert_way, values + cache_ctx.line_size * key_idx); - } else { - if (thread_ctx.lane_id == n_warp_missing) { - warp_missing_key = key; - warp_missing_index = key_idx; - } - __syncthreads(); - n_warp_missing += 1; - } - set_ctx.Unlock(thread_ctx); - } - if (n_warp_missing > 0) { - uint32_t base_missing_idx = 0; - if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing, n_warp_missing); } - __syncthreads(); - base_missing_idx = __shfl(base_missing_idx, 0); - if (thread_ctx.lane_id < n_warp_missing) { - missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; - missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; - } - __syncthreads(); - } - } -} - -template -__global__ void EvictKernel(LruCacheContext cache_ctx, const Key* keys, - const uint32_t* indices, const Elem* values, const uint32_t* n_evict, - Key* evicted_keys, Elem* evicted_values) { - ThreadContext thread_ctx{}; - uint32_t num_evict = *n_evict; - __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; - for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_evict; - batch_offset += thread_ctx.num_warps * kWarpSize) { - const uint32_t n_batch_keys = min(kWarpSize, num_evict - batch_offset); - if (thread_ctx.lane_id < n_batch_keys) { - const Key key = keys[batch_offset + thread_ctx.lane_id]; - const size_t hash = LruCacheHash()(key); - const uint32_t set_id = hash % cache_ctx.n_set; - block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; - block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; - } - __syncthreads(); - for (uint32_t i = 0; i < n_batch_keys; ++i) { - const uint32_t key_idx = batch_offset + i; - const Key key = block_keys[thread_ctx.warp_id_in_block][i]; - const uint32_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; - SetContext set_ctx(cache_ctx, set_id); - set_ctx.Lock(thread_ctx); - int evicted_way = -1; - Key evicted_key = 0; - set_ctx.Evict(cache_ctx, 
thread_ctx, key, &evicted_way, &evicted_key); - if (thread_ctx.lane_id == 0) { evicted_keys[key_idx] = evicted_key; } - __syncthreads(); - set_ctx.Read(cache_ctx, thread_ctx, evicted_way, - evicted_values + cache_ctx.line_size * key_idx); - set_ctx.Write(cache_ctx, thread_ctx, evicted_way, - values + cache_ctx.line_size * indices[key_idx]); - set_ctx.Unlock(thread_ctx); - } - } -} - -template -__global__ void DumpKernel(LruCacheContext cache_ctx, size_t start_key_index, - size_t end_key_index, uint32_t* n_dumped, Key* keys, Elem* values) { - ThreadContext thread_ctx{}; - __shared__ Key warp_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ uint8_t warp_ages[kNumWarpPerBlock][kWarpSize]; - for (uint32_t warp_start_key_index = start_key_index + thread_ctx.global_warp_id * kWarpSize; - warp_start_key_index < end_key_index; - warp_start_key_index += thread_ctx.num_warps * kWarpSize) { - Key lane_key = 0; - uint8_t lane_age = 0; - if (warp_start_key_index + thread_ctx.lane_id < end_key_index) { - lane_key = cache_ctx.keys[warp_start_key_index + thread_ctx.lane_id]; - lane_age = cache_ctx.ages[warp_start_key_index + thread_ctx.lane_id]; - } - __syncthreads(); - warp_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_key; - warp_ages[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_age; - const int key_count = __popc(static_cast(__ballot(lane_age != 0))); - if (key_count == 0) { continue; } - uint32_t offset = 0; - if (thread_ctx.lane_id == 0) { offset = atomicAdd(n_dumped, key_count); } - offset = __shfl(offset, 0); - __syncthreads(); - for (uint32_t i = 0; i < kWarpSize; ++i) { - const Key key = warp_keys[thread_ctx.warp_id_in_block][i]; - const Key age = warp_ages[thread_ctx.warp_id_in_block][i]; - if (age == 0) { continue; } - if (thread_ctx.lane_id == 0) { keys[offset] = key; } - __syncthreads(); - for (uint32_t j = thread_ctx.lane_id; j < cache_ctx.line_size; j += kWarpSize) { - values[offset * cache_ctx.line_size + j] = - cache_ctx.lines[(warp_start_key_index + i) * cache_ctx.line_size + j]; - } - __syncthreads(); - offset += 1; - } - } -} - -template -class LruCache : public Cache { - public: - OF_DISALLOW_COPY_AND_MOVE(LruCache); - explicit LruCache(const CacheOptions& options) - : device_index_{}, - max_query_length_(0), - query_indices_buffer_(nullptr), - query_keys_buffer_(nullptr), - value_type_(options.value_type) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - InitLruCacheContext(options, &ctx_); - } - ~LruCache() override { - CudaCurrentDeviceGuard guard(device_index_); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(query_indices_buffer_)); - OF_CUDA_CHECK(hipFree(query_keys_buffer_)); - } - DestroyLruCacheContext(&ctx_); - } - - uint32_t KeySize() const override { return sizeof(Key); } - uint32_t ValueSize() const override { return sizeof(Elem) * ctx_.line_size; } - DataType ValueType() const override { return value_type_; } - uint64_t Capacity() const override { return ctx_.n_set * kWarpSize; } - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length < max_query_length_) { return; } - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(query_indices_buffer_)); - OF_CUDA_CHECK(hipFree(query_keys_buffer_)); - } - OF_CUDA_CHECK(hipMalloc(&query_indices_buffer_, query_length * sizeof(uint32_t))); - OF_CUDA_CHECK(hipMalloc(&query_keys_buffer_, query_length * sizeof(Key))); - max_query_length_ = query_length; - } - 
- CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kLRU; } - - void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) override { - CHECK_LE(n_keys, max_query_length_); - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - if (n_keys == 0) { return; } - cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, - static_cast(keys), nullptr, n_missing, - static_cast(missing_keys), missing_indices); - } - - void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) override { - CHECK_LE(n_keys, max_query_length_); - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - if (n_keys == 0) { return; } - cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, - static_cast(keys), static_cast(values), n_missing, - static_cast(missing_keys), missing_indices); - } - - void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override { - CHECK_LE(n_keys, max_query_length_); - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - if (n_keys == 0) { return; } - cuda_stream->LaunchKernel(PutWithoutEvictingKernel, GetLaunchConfig(n_keys), ctx_, - n_keys, static_cast(keys), - static_cast(values), n_evicted, query_keys_buffer_, - query_indices_buffer_); - cuda_stream->LaunchKernel(EvictKernel, GetLaunchConfig(n_keys), ctx_, - query_keys_buffer_, query_indices_buffer_, - static_cast(values), n_evicted, - static_cast(evicted_keys), static_cast(evicted_values)); - } - - void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, void* keys, void* values) override { - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - const uint64_t max_dump_keys = end_key_index - start_key_index; - cuda_stream->LaunchKernel( - DumpKernel, - ep::CudaLaunchConfig((max_dump_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, kBlockSize, - 0), - ctx_, start_key_index, end_key_index, n_dumped, static_cast(keys), - static_cast(values)); - } - - void Clear() override { ClearLruCacheContext(&ctx_); } - - private: - int device_index_; - uint32_t max_query_length_; - LruCacheContext ctx_; - uint32_t* query_indices_buffer_; - Key* query_keys_buffer_; - DataType value_type_; -}; - -template -std::unique_ptr DispatchValueType(const CacheOptions& options) { - if (options.value_size % sizeof(ulonglong2) == 0) { - return std::unique_ptr(new LruCache(options)); - } else if (options.value_size % sizeof(uint64_t) == 0) { - return std::unique_ptr(new LruCache(options)); - } else if (options.value_size % sizeof(uint32_t) == 0) { - return std::unique_ptr(new LruCache(options)); - } else if (options.value_size % sizeof(uint16_t) == 0) { - return std::unique_ptr(new LruCache(options)); - } else { - return std::unique_ptr(new LruCache(options)); - } -} - -std::unique_ptr DispatchKeyType(const CacheOptions& options) { - if (options.key_size == sizeof(uint32_t)) { - return DispatchValueType(options); - } else if (options.key_size == sizeof(uint64_t)) { - return DispatchValueType(options); - } else { - 
UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace - -std::unique_ptr NewLruCache(const CacheOptions& options) { return DispatchKeyType(options); } - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Inspired by https://github.com/NVIDIA-Merlin/HugeCTR/blob/master/gpu_cache/src/nv_gpu_cache.cu + +#include "oneflow/core/embedding/lru_cache.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/embedding/hash_functions.hip.h" +#include +#include + +namespace oneflow { + +namespace embedding { + +namespace { + +constexpr int kWarpSize = 64; +constexpr int kNumWarpPerBlock = 2; +constexpr int kBlockSize = kNumWarpPerBlock * kWarpSize; +constexpr unsigned long long int kFullMask = 0xFFFFFFFFFFFFFFFFU; + +ep::CudaLaunchConfig GetLaunchConfig(uint32_t n_keys) { + return ep::CudaLaunchConfig((n_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, + kWarpSize * kNumWarpPerBlock, 0); +} + +struct ThreadContext { + __device__ ThreadContext() { + const uint32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + global_warp_id = global_thread_id / kWarpSize; + warp_id_in_block = global_warp_id % kNumWarpPerBlock; // NOLINT + num_warps = gridDim.x * kNumWarpPerBlock; // NOLINT + lane_id = global_thread_id % kWarpSize; + } + + uint32_t global_warp_id; + uint32_t warp_id_in_block; + uint32_t num_warps; + uint32_t lane_id; +}; + +class WarpMutexAtomicImpl { + public: + OF_DISALLOW_COPY_AND_MOVE(WarpMutexAtomicImpl); + __device__ WarpMutexAtomicImpl() : flag_(0) {} + __device__ ~WarpMutexAtomicImpl() = default; + + __device__ void Lock(const ThreadContext& thread_ctx) { + if (thread_ctx.lane_id == 0) { + while (atomicCAS(&flag_, 0, 1) != 0) + ; + } + __threadfence(); + __syncthreads(); + } + + __device__ void Unlock(const ThreadContext& thread_ctx) { + __syncthreads(); + __threadfence(); + if (thread_ctx.lane_id == 0) { atomicExch(&flag_, 0); } + } + + private: + int32_t flag_; +}; + +template +struct LruCacheContext { + Key* keys; + Elem* lines; + uint8_t* ages; + void* mutex; + uint64_t n_set; + uint32_t line_size; + CacheOptions::MemoryKind value_memory_kind; +}; + +__global__ void InitCacheSetMutex(uint32_t n_set, void* mutex) { + + using WarpMutex = WarpMutexAtomicImpl; + + const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n_set) { new (reinterpret_cast(mutex) + idx) WarpMutex; } +} + +template +void ClearLruCacheContext(LruCacheContext* ctx) { + OF_CUDA_CHECK(hipMemset(ctx->keys, 0, ctx->n_set * kWarpSize * sizeof(Key))); + OF_CUDA_CHECK(hipMemset(ctx->ages, 0, ctx->n_set * kWarpSize * sizeof(uint8_t))); + InitCacheSetMutex<<<(ctx->n_set - 1 + 256) / 256, 256>>>(ctx->n_set, ctx->mutex); +} + +template +void InitLruCacheContext(const CacheOptions& options, LruCacheContext* ctx) { + const size_t keys_size_per_set = kWarpSize * sizeof(Key); + const uint32_t line_size = options.value_size / sizeof(Elem); + const size_t lines_size_per_set = kWarpSize * line_size * sizeof(Elem); + 
const size_t ages_size_per_set = kWarpSize * sizeof(uint8_t); + int device = 0; + OF_CUDA_CHECK(hipGetDevice(&device)); + int major = 0; + OF_CUDA_CHECK(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, device)); + size_t mutex_size_per_set = 0; + + mutex_size_per_set = sizeof(WarpMutexAtomicImpl); + + const size_t n_set = (options.capacity - 1 + kWarpSize) / kWarpSize; + CHECK_GT(n_set, 0); + ctx->n_set = n_set; + ctx->line_size = line_size; + const size_t keys_size = n_set * keys_size_per_set; + OF_CUDA_CHECK(hipMalloc(&(ctx->keys), keys_size)); + const size_t lines_size = n_set * lines_size_per_set; + if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipMalloc(&(ctx->lines), lines_size)); + } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { + if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&(ctx->lines)), lines_size)); + } else { + OF_CUDA_CHECK( + NumaAwareCudaMallocHost(device, reinterpret_cast(&ctx->lines), lines_size)); + } + } else { + UNIMPLEMENTED(); + } + ctx->value_memory_kind = options.value_memory_kind; + const size_t ages_size = n_set * ages_size_per_set; + OF_CUDA_CHECK(hipMalloc(&(ctx->ages), ages_size)); + const size_t mutex_size = n_set * mutex_size_per_set; + OF_CUDA_CHECK(hipMalloc(&(ctx->mutex), mutex_size)); + + ClearLruCacheContext(ctx); +} + +template +void DestroyLruCacheContext(LruCacheContext* ctx) { + OF_CUDA_CHECK(hipFree(ctx->keys)); + if (ctx->value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipFree(ctx->lines)); + } else if (ctx->value_memory_kind == CacheOptions::MemoryKind::kHost) { + OF_CUDA_CHECK(hipHostFree(ctx->lines)); + } else { + UNIMPLEMENTED(); + } + OF_CUDA_CHECK(hipFree(ctx->ages)); + OF_CUDA_CHECK(hipFree(ctx->mutex)); +} + +template +struct SetContext { + + using WarpMutex = WarpMutexAtomicImpl; + + __device__ SetContext(const LruCacheContext& ctx, uint32_t set_id) + : keys(ctx.keys + set_id * kWarpSize), + mutex(reinterpret_cast(ctx.mutex) + set_id), + ages(ctx.ages + set_id * kWarpSize), + lines(ctx.lines + set_id * kWarpSize * ctx.line_size) {} + + __device__ int Lookup(const ThreadContext& thread_ctx, Key key) { + const Key lane_key = keys[thread_ctx.lane_id]; + const int lane_age = ages[thread_ctx.lane_id]; + const bool lane_hit = (lane_key == key && lane_age != 0); + const unsigned long long int hit_mask = __ballot(lane_hit); + if (hit_mask != 0) { + return __ffs(static_cast(hit_mask)) - 1; + } else { + return -1; + } + } + + __device__ void Read(const LruCacheContext& cache_ctx, const ThreadContext& thread_ctx, + int way, Elem* line) { + const Elem* from_line = lines + way * cache_ctx.line_size; + for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { + line[i] = from_line[i]; + } + } + + __device__ int InsertWithoutEvicting(const LruCacheContext& cache_ctx, + const ThreadContext& thread_ctx, Key key) { + int insert_way = -1; + const Key lane_key = keys[thread_ctx.lane_id]; + int lane_age = ages[thread_ctx.lane_id]; + const unsigned long long int hit_mask = __ballot(lane_key == key && lane_age != 0); + if (hit_mask != 0) { + insert_way = __ffs(static_cast(hit_mask)) - 1; + const int insert_way_age = __shfl(lane_age, insert_way); + if (lane_age > insert_way_age) { + lane_age -= 1; + } else if (thread_ctx.lane_id == insert_way) { + lane_age = kWarpSize; + } + __syncthreads(); + } + if (insert_way == -1) { + const unsigned 
long long int valid_mask = __ballot(lane_age != 0); + if (valid_mask != kFullMask) { + insert_way = __popc(static_cast(valid_mask)); + if (lane_age > 0) { + lane_age -= 1; + } else if (thread_ctx.lane_id == insert_way) { + lane_age = kWarpSize; + keys[insert_way] = key; + } + __syncthreads(); + } + } + if (insert_way != -1) { ages[thread_ctx.lane_id] = lane_age; } + return insert_way; + } + + __device__ void Evict(const LruCacheContext& cache_ctx, + const ThreadContext& thread_ctx, Key key, int* way, Key* evicted_key) { + const Key lane_key = keys[thread_ctx.lane_id]; + int lane_age = ages[thread_ctx.lane_id]; + const int insert_way = __ffs(static_cast(__ballot(lane_age == 1))) - 1; + *evicted_key = __shfl(lane_key, insert_way); + if (thread_ctx.lane_id == insert_way) { + keys[insert_way] = key; + lane_age = kWarpSize; + } else if (lane_age > 1) { + lane_age -= 1; + } + __syncthreads(); + ages[thread_ctx.lane_id] = lane_age; + *way = insert_way; + } + + __device__ void Write(const LruCacheContext& cache_ctx, + const ThreadContext& thread_ctx, int way, const Elem* line) { + Elem* to_line = lines + way * cache_ctx.line_size; + for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { + to_line[i] = line[i]; + } + } + + __device__ void Lock(const ThreadContext& thread_ctx) { mutex->Lock(thread_ctx); } + + __device__ void Unlock(const ThreadContext& thread_ctx) { mutex->Unlock(thread_ctx); } + + Key* keys; + Elem* lines; + uint8_t* ages; + WarpMutex* mutex; +}; + +template +__global__ void GetKernel(LruCacheContext cache_ctx, uint32_t num_keys, const Key* keys, + Elem* values, uint32_t* n_missing_keys, Key* missing_keys, + uint32_t* missing_indices) { + ThreadContext thread_ctx{}; + __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; + for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; + batch_offset += thread_ctx.num_warps * kWarpSize) { + const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); + if (thread_ctx.lane_id < n_batch_keys) { + const Key key = keys[batch_offset + thread_ctx.lane_id]; + const size_t hash = LruCacheHash()(key); + const uint32_t set_id = hash % cache_ctx.n_set; + block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; + block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; + } + __syncthreads(); + uint32_t n_warp_missing = 0; + Key warp_missing_key = 0; + uint32_t warp_missing_index = 0; + for (uint32_t i = 0; i < n_batch_keys; ++i) { + const uint32_t key_idx = batch_offset + i; + const Key key = block_keys[thread_ctx.warp_id_in_block][i]; + const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; + SetContext set_ctx(cache_ctx, set_id); + const int way = set_ctx.Lookup(thread_ctx, key); + if (way < 0) { + if (thread_ctx.lane_id == n_warp_missing) { + warp_missing_key = key; + warp_missing_index = key_idx; + } + __syncthreads(); + n_warp_missing += 1; + } else if (!test_only) { + set_ctx.Read(cache_ctx, thread_ctx, way, values + key_idx * cache_ctx.line_size); + } + } + if (n_warp_missing > 0) { + uint32_t base_missing_idx = 0; + if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing_keys, n_warp_missing); } + __syncthreads(); + base_missing_idx = __shfl(base_missing_idx, 0); + if (thread_ctx.lane_id < n_warp_missing) { + missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; + missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; + 
} + __syncthreads(); + } + __syncthreads(); + } +} + +template +__global__ void PutWithoutEvictingKernel(LruCacheContext cache_ctx, uint32_t num_keys, + const Key* keys, const Elem* values, uint32_t* n_missing, + Key* missing_keys, uint32_t* missing_indices) { + ThreadContext thread_ctx{}; + __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; + for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; + batch_offset += thread_ctx.num_warps * kWarpSize) { + const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); + if (thread_ctx.lane_id < n_batch_keys) { + const Key key = keys[batch_offset + thread_ctx.lane_id]; + const size_t hash = LruCacheHash()(key); + const uint32_t set_id = hash % cache_ctx.n_set; + block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; + block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; + } + __syncthreads(); + uint32_t n_warp_missing = 0; + Key warp_missing_key = 0; + uint32_t warp_missing_index = 0; + for (uint32_t i = 0; i < n_batch_keys; ++i) { + const uint32_t key_idx = batch_offset + i; + const Key key = block_keys[thread_ctx.warp_id_in_block][i]; + const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; + SetContext set_ctx(cache_ctx, set_id); + set_ctx.Lock(thread_ctx); + Key evicted_key = 0; + const int insert_way = set_ctx.InsertWithoutEvicting(cache_ctx, thread_ctx, key); + if (insert_way >= 0) { + set_ctx.Write(cache_ctx, thread_ctx, insert_way, values + cache_ctx.line_size * key_idx); + } else { + if (thread_ctx.lane_id == n_warp_missing) { + warp_missing_key = key; + warp_missing_index = key_idx; + } + __syncthreads(); + n_warp_missing += 1; + } + set_ctx.Unlock(thread_ctx); + } + if (n_warp_missing > 0) { + uint32_t base_missing_idx = 0; + if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing, n_warp_missing); } + __syncthreads(); + base_missing_idx = __shfl(base_missing_idx, 0); + if (thread_ctx.lane_id < n_warp_missing) { + missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; + missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; + } + __syncthreads(); + } + } +} + +template +__global__ void EvictKernel(LruCacheContext cache_ctx, const Key* keys, + const uint32_t* indices, const Elem* values, const uint32_t* n_evict, + Key* evicted_keys, Elem* evicted_values) { + ThreadContext thread_ctx{}; + uint32_t num_evict = *n_evict; + __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; + for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_evict; + batch_offset += thread_ctx.num_warps * kWarpSize) { + const uint32_t n_batch_keys = min(kWarpSize, num_evict - batch_offset); + if (thread_ctx.lane_id < n_batch_keys) { + const Key key = keys[batch_offset + thread_ctx.lane_id]; + const size_t hash = LruCacheHash()(key); + const uint32_t set_id = hash % cache_ctx.n_set; + block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; + block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; + } + __syncthreads(); + for (uint32_t i = 0; i < n_batch_keys; ++i) { + const uint32_t key_idx = batch_offset + i; + const Key key = block_keys[thread_ctx.warp_id_in_block][i]; + const uint32_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; + SetContext set_ctx(cache_ctx, set_id); + set_ctx.Lock(thread_ctx); + int evicted_way = 
-1; + Key evicted_key = 0; + set_ctx.Evict(cache_ctx, thread_ctx, key, &evicted_way, &evicted_key); + if (thread_ctx.lane_id == 0) { evicted_keys[key_idx] = evicted_key; } + __syncthreads(); + set_ctx.Read(cache_ctx, thread_ctx, evicted_way, + evicted_values + cache_ctx.line_size * key_idx); + set_ctx.Write(cache_ctx, thread_ctx, evicted_way, + values + cache_ctx.line_size * indices[key_idx]); + set_ctx.Unlock(thread_ctx); + } + } +} + +template +__global__ void DumpKernel(LruCacheContext cache_ctx, size_t start_key_index, + size_t end_key_index, uint32_t* n_dumped, Key* keys, Elem* values) { + ThreadContext thread_ctx{}; + __shared__ Key warp_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ uint8_t warp_ages[kNumWarpPerBlock][kWarpSize]; + for (uint32_t warp_start_key_index = start_key_index + thread_ctx.global_warp_id * kWarpSize; + warp_start_key_index < end_key_index; + warp_start_key_index += thread_ctx.num_warps * kWarpSize) { + Key lane_key = 0; + uint8_t lane_age = 0; + if (warp_start_key_index + thread_ctx.lane_id < end_key_index) { + lane_key = cache_ctx.keys[warp_start_key_index + thread_ctx.lane_id]; + lane_age = cache_ctx.ages[warp_start_key_index + thread_ctx.lane_id]; + } + __syncthreads(); + warp_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_key; + warp_ages[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_age; + const int key_count = __popc(static_cast(__ballot(lane_age != 0))); + if (key_count == 0) { continue; } + uint32_t offset = 0; + if (thread_ctx.lane_id == 0) { offset = atomicAdd(n_dumped, key_count); } + offset = __shfl(offset, 0); + __syncthreads(); + for (uint32_t i = 0; i < kWarpSize; ++i) { + const Key key = warp_keys[thread_ctx.warp_id_in_block][i]; + const Key age = warp_ages[thread_ctx.warp_id_in_block][i]; + if (age == 0) { continue; } + if (thread_ctx.lane_id == 0) { keys[offset] = key; } + __syncthreads(); + for (uint32_t j = thread_ctx.lane_id; j < cache_ctx.line_size; j += kWarpSize) { + values[offset * cache_ctx.line_size + j] = + cache_ctx.lines[(warp_start_key_index + i) * cache_ctx.line_size + j]; + } + __syncthreads(); + offset += 1; + } + } +} + +template +class LruCache : public Cache { + public: + OF_DISALLOW_COPY_AND_MOVE(LruCache); + explicit LruCache(const CacheOptions& options) + : device_index_{}, + max_query_length_(0), + query_indices_buffer_(nullptr), + query_keys_buffer_(nullptr), + value_type_(options.value_type) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + InitLruCacheContext(options, &ctx_); + } + ~LruCache() override { + CudaCurrentDeviceGuard guard(device_index_); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(query_indices_buffer_)); + OF_CUDA_CHECK(hipFree(query_keys_buffer_)); + } + DestroyLruCacheContext(&ctx_); + } + + uint32_t KeySize() const override { return sizeof(Key); } + uint32_t ValueSize() const override { return sizeof(Elem) * ctx_.line_size; } + DataType ValueType() const override { return value_type_; } + uint64_t Capacity() const override { return ctx_.n_set * kWarpSize; } + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length < max_query_length_) { return; } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(query_indices_buffer_)); + OF_CUDA_CHECK(hipFree(query_keys_buffer_)); + } + OF_CUDA_CHECK(hipMalloc(&query_indices_buffer_, query_length * sizeof(uint32_t))); + OF_CUDA_CHECK(hipMalloc(&query_keys_buffer_, query_length * 
sizeof(Key))); + max_query_length_ = query_length; + } + + CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kLRU; } + + void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) override { + CHECK_LE(n_keys, max_query_length_); + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + if (n_keys == 0) { return; } + cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, + static_cast(keys), nullptr, n_missing, + static_cast(missing_keys), missing_indices); + } + + void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) override { + CHECK_LE(n_keys, max_query_length_); + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + if (n_keys == 0) { return; } + cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, + static_cast(keys), static_cast(values), n_missing, + static_cast(missing_keys), missing_indices); + } + + void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override { + CHECK_LE(n_keys, max_query_length_); + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + if (n_keys == 0) { return; } + cuda_stream->LaunchKernel(PutWithoutEvictingKernel, GetLaunchConfig(n_keys), ctx_, + n_keys, static_cast(keys), + static_cast(values), n_evicted, query_keys_buffer_, + query_indices_buffer_); + cuda_stream->LaunchKernel(EvictKernel, GetLaunchConfig(n_keys), ctx_, + query_keys_buffer_, query_indices_buffer_, + static_cast(values), n_evicted, + static_cast(evicted_keys), static_cast(evicted_values)); + } + + void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, void* keys, void* values) override { + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + const uint64_t max_dump_keys = end_key_index - start_key_index; + cuda_stream->LaunchKernel( + DumpKernel, + ep::CudaLaunchConfig((max_dump_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, kBlockSize, + 0), + ctx_, start_key_index, end_key_index, n_dumped, static_cast(keys), + static_cast(values)); + } + + void Clear() override { ClearLruCacheContext(&ctx_); } + + private: + int device_index_; + uint32_t max_query_length_; + LruCacheContext ctx_; + uint32_t* query_indices_buffer_; + Key* query_keys_buffer_; + DataType value_type_; +}; + +template +std::unique_ptr DispatchValueType(const CacheOptions& options) { + if (options.value_size % sizeof(ulonglong2) == 0) { + return std::unique_ptr(new LruCache(options)); + } else if (options.value_size % sizeof(uint64_t) == 0) { + return std::unique_ptr(new LruCache(options)); + } else if (options.value_size % sizeof(uint32_t) == 0) { + return std::unique_ptr(new LruCache(options)); + } else if (options.value_size % sizeof(uint16_t) == 0) { + return std::unique_ptr(new LruCache(options)); + } else { + return std::unique_ptr(new LruCache(options)); + } +} + +std::unique_ptr DispatchKeyType(const CacheOptions& options) { + if (options.key_size == sizeof(uint32_t)) { + return DispatchValueType(options); + } else if (options.key_size == 
sizeof(uint64_t)) { + return DispatchValueType(options); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace + +std::unique_ptr NewLruCache(const CacheOptions& options) { return DispatchKeyType(options); } + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/mock_key_value_store.hip.cpp b/oneflow/core/embedding/mock_key_value_store.hip.cpp index de55f2a..9897779 100644 --- a/oneflow/core/embedding/mock_key_value_store.hip.cpp +++ b/oneflow/core/embedding/mock_key_value_store.hip.cpp @@ -1,249 +1,249 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/embedding/mock_key_value_store.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace embedding { - -namespace { - -template -class IteratorImpl : public KVIterator { - public: - OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); - IteratorImpl(HashMap* store, uint32_t key_size, uint32_t value_size, - uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, - uint32_t* host_num_buffer) - : store_(store), - pos_(store->begin()), - key_size_(key_size), - value_size_(value_size), - max_query_length_(max_query_length), - host_keys_buffer_(host_keys_buffer), - host_values_buffer_(host_values_buffer), - host_num_buffer_(host_num_buffer) {} - ~IteratorImpl() override = default; - - void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, - void* values) override { - CHECK_LE(n_request, max_query_length_); - auto cuda_stream = stream->As(); - CHECK_JUST(cuda_stream->Sync()); - *host_num_buffer_ = 0; - while (*host_num_buffer_ < n_request && pos_ != store_->end()) { - reinterpret_cast(host_keys_buffer_)[*host_num_buffer_] = pos_->first; - std::memcpy(reinterpret_cast(host_values_buffer_) + *host_num_buffer_ * value_size_, - pos_->second.data(), value_size_); - } - OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - const uint32_t num_keys = *host_num_buffer_; - if (num_keys != 0) { - OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - } - } - - void Reset() override { pos_ = store_->begin(); } - - private: - HashMap* store_; - typename HashMap::iterator pos_; - uint32_t key_size_; - uint32_t value_size_; - uint32_t max_query_length_; - void* host_keys_buffer_; - void* host_values_buffer_; - uint32_t* host_num_buffer_; -}; - -template -class KeyValueStoreImpl : public KeyValueStore { - public: - OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); - explicit KeyValueStoreImpl(const MockKeyValueStoreOptions& options) - : device_index_(-1), max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - key_size_ = options.key_size; - value_size_ = 
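Note on the dispatch helpers that close out lru_cache.hip.cpp: DispatchValueType picks the widest element type whose size evenly divides options.value_size, so each warp lane can move more bytes per iteration when a cache line is read or written. The following is a minimal, self-contained sketch of that selection rule only, not the OneFlow API; ULongLong2 stands in for HIP's 16-byte ulonglong2 and ChosenElemWidth is an illustrative name.

// Sketch (illustrative): reproduce the element-width selection rule used by
// DispatchValueType. The cache copies each value as an array of Elem, so it
// prefers the widest type whose size evenly divides value_size.
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for HIP's 16-byte ulonglong2 vector type.
struct ULongLong2 { unsigned long long x, y; };

std::size_t ChosenElemWidth(std::size_t value_size) {
  if (value_size % sizeof(ULongLong2) == 0) { return sizeof(ULongLong2); }  // 16-byte copies
  if (value_size % sizeof(uint64_t) == 0) { return sizeof(uint64_t); }      // 8-byte copies
  if (value_size % sizeof(uint32_t) == 0) { return sizeof(uint32_t); }      // 4-byte copies
  if (value_size % sizeof(uint16_t) == 0) { return sizeof(uint16_t); }      // 2-byte copies
  return sizeof(uint8_t);                                                   // byte-wise copies
}

int main() {
  // e.g. a 128-dim float16 embedding row is 256 bytes, so it is copied as 16-byte elements.
  std::printf("elem width for 256-byte rows: %zu\n", ChosenElemWidth(256));
  std::printf("elem width for 6-byte rows:   %zu\n", ChosenElemWidth(6));
  return 0;
}

With a wider Elem, line_size = value_size / sizeof(Elem) shrinks, so the per-way Read/Write loops in SetContext finish in fewer iterations per lane.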
options.value_size; - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_query_values_), - value_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), - sizeof(uint32_t))); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * max_query_length_)); - } - ~KeyValueStoreImpl() { - CudaCurrentDeviceGuard guard(device_index_); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(hipHostFree(host_n_missing_)); - } - - uint32_t KeySize() const override { return key_size_; } - - uint32_t ValueSize() const override { return value_size_; } - - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * query_length)); - max_query_length_ = query_length; - } - - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint32_t* n_missing, uint32_t* missing_indices) override; - void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; - bool SnapshotExists(const std::string& name) override; - void LoadSnapshot(const std::string& name) override; - void LoadSnapshot(const std::string& name, - const std::function& Hook) override; - void SaveSnapshot(const std::string& name) override; - - private: - int device_index_; - uint32_t max_query_length_; - uint32_t key_size_; - uint32_t value_size_; - Key* host_query_keys_{}; - uint8_t* host_query_values_{}; - uint32_t* host_n_missing_{}; - uint32_t* host_missing_indices_{}; - HashMap store_; - HashMap> snapshots_; - std::mutex mutex_; -}; - -template -void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint32_t* n_missing, uint32_t* missing_indices) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - return; - } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - *host_n_missing_ = 0; - for (uint32_t i = 0; i < num_keys; ++i) { - auto it = store_.find(host_query_keys_[i]); - if (it != store_.end()) { - std::memcpy(host_query_values_ + i * value_size_, it->second.data(), value_size_); - } else { - host_missing_indices_[*host_n_missing_] = i; - *host_n_missing_ += 1; - } - } - OF_CUDA_CHECK(hipMemcpyAsync(values, 
host_query_values_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, - (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); -} - -template -void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, - const void* values) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { return; } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - for (uint32_t i = 0; i < num_keys; ++i) { - store_[host_query_keys_[i]] = std::string( - reinterpret_cast(host_query_values_) + i * value_size_, value_size_); - } -} - -template -bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { - return snapshots_.find(name) != snapshots_.end(); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - LoadSnapshot(name, nullptr); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name, - const std::function& Hook) { - CudaCurrentDeviceGuard guard(device_index_); - store_ = snapshots_[name]; - if (Hook) { - IteratorImpl iterator(&store_, KeySize(), ValueSize(), max_query_length_, host_query_keys_, - host_query_values_, host_n_missing_); - Hook(&iterator); - } -} - -template -void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - snapshots_[name] = store_; -} - -} // namespace - -std::unique_ptr NewMockKeyValueStore(const MockKeyValueStoreOptions& options) { - if (options.key_size == sizeof(uint64_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else if (options.key_size == sizeof(uint32_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/embedding/mock_key_value_store.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace embedding { + +namespace { + +template +class IteratorImpl : public KVIterator { + public: + OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); + IteratorImpl(HashMap* store, uint32_t key_size, uint32_t value_size, + uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, + uint32_t* host_num_buffer) + : store_(store), + pos_(store->begin()), + key_size_(key_size), + value_size_(value_size), + max_query_length_(max_query_length), + host_keys_buffer_(host_keys_buffer), + host_values_buffer_(host_values_buffer), + host_num_buffer_(host_num_buffer) {} + ~IteratorImpl() override = default; + + void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, + void* values) override { + CHECK_LE(n_request, max_query_length_); + auto cuda_stream = stream->As(); + CHECK_JUST(cuda_stream->Sync()); + *host_num_buffer_ = 0; + while (*host_num_buffer_ < n_request && pos_ != store_->end()) { + reinterpret_cast(host_keys_buffer_)[*host_num_buffer_] = pos_->first; + std::memcpy(reinterpret_cast(host_values_buffer_) + *host_num_buffer_ * value_size_, + pos_->second.data(), value_size_); + } + OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + const uint32_t num_keys = *host_num_buffer_; + if (num_keys != 0) { + OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + } + } + + void Reset() override { pos_ = store_->begin(); } + + private: + HashMap* store_; + typename HashMap::iterator pos_; + uint32_t key_size_; + uint32_t value_size_; + uint32_t max_query_length_; + void* host_keys_buffer_; + void* host_values_buffer_; + uint32_t* host_num_buffer_; +}; + +template +class KeyValueStoreImpl : public KeyValueStore { + public: + OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); + explicit KeyValueStoreImpl(const MockKeyValueStoreOptions& options) + : device_index_(-1), max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + key_size_ = options.key_size; + value_size_ = options.value_size; + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_query_values_), + value_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), + sizeof(uint32_t))); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * max_query_length_)); + } + ~KeyValueStoreImpl() { + CudaCurrentDeviceGuard guard(device_index_); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(hipHostFree(host_n_missing_)); + } + + uint32_t KeySize() const override { return key_size_; } + + uint32_t ValueSize() const override { return value_size_; } + + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { 
return; } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * query_length)); + max_query_length_ = query_length; + } + + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint32_t* n_missing, uint32_t* missing_indices) override; + void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; + bool SnapshotExists(const std::string& name) override; + void LoadSnapshot(const std::string& name) override; + void LoadSnapshot(const std::string& name, + const std::function& Hook) override; + void SaveSnapshot(const std::string& name) override; + + private: + int device_index_; + uint32_t max_query_length_; + uint32_t key_size_; + uint32_t value_size_; + Key* host_query_keys_{}; + uint8_t* host_query_values_{}; + uint32_t* host_n_missing_{}; + uint32_t* host_missing_indices_{}; + HashMap store_; + HashMap> snapshots_; + std::mutex mutex_; +}; + +template +void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint32_t* n_missing, uint32_t* missing_indices) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + return; + } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + *host_n_missing_ = 0; + for (uint32_t i = 0; i < num_keys; ++i) { + auto it = store_.find(host_query_keys_[i]); + if (it != store_.end()) { + std::memcpy(host_query_values_ + i * value_size_, it->second.data(), value_size_); + } else { + host_missing_indices_[*host_n_missing_] = i; + *host_n_missing_ += 1; + } + } + OF_CUDA_CHECK(hipMemcpyAsync(values, host_query_values_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, + (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); +} + +template +void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, + const void* values) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { return; } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + for (uint32_t i = 0; i < num_keys; ++i) { + store_[host_query_keys_[i]] = std::string( + reinterpret_cast(host_query_values_) + i * value_size_, value_size_); + } +} + +template +bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { + return 
snapshots_.find(name) != snapshots_.end(); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + LoadSnapshot(name, nullptr); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name, + const std::function& Hook) { + CudaCurrentDeviceGuard guard(device_index_); + store_ = snapshots_[name]; + if (Hook) { + IteratorImpl iterator(&store_, KeySize(), ValueSize(), max_query_length_, host_query_keys_, + host_query_values_, host_n_missing_); + Hook(&iterator); + } +} + +template +void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + snapshots_[name] = store_; +} + +} // namespace + +std::unique_ptr NewMockKeyValueStore(const MockKeyValueStoreOptions& options) { + if (options.key_size == sizeof(uint64_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else if (options.key_size == sizeof(uint32_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp b/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp index 46fca92..8ec7a04 100644 --- a/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp +++ b/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp @@ -1,243 +1,243 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
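mock_key_value_store.hip.cpp above resolves every query on the host: the queried keys are staged into pinned buffers, looked up in an in-memory HashMap, found values are written at the key's query index, and the indices of keys that were not found are compacted into missing_indices while *host_n_missing_ counts them. Below is a host-only sketch of that lookup step with illustrative names (MockGet, a plain std::unordered_map) rather than the actual OneFlow types.

// Sketch (illustrative): the miss-compacting lookup performed by KeyValueStoreImpl::Get.
#include <cstdint>
#include <cstring>
#include <string>
#include <unordered_map>
#include <vector>

void MockGet(const std::unordered_map<uint64_t, std::string>& store, uint32_t value_size,
             const std::vector<uint64_t>& keys, std::vector<uint8_t>* values,
             std::vector<uint32_t>* missing_indices) {
  values->assign(keys.size() * value_size, 0);
  missing_indices->clear();
  for (std::size_t i = 0; i < keys.size(); ++i) {
    auto it = store.find(keys[i]);
    if (it != store.end()) {
      // Each stored value is assumed to hold at least value_size bytes.
      std::memcpy(values->data() + i * value_size, it->second.data(), value_size);
    } else {
      // n_missing is simply missing_indices->size() after the loop.
      missing_indices->push_back(static_cast<uint32_t>(i));
    }
  }
}

int main() {
  std::unordered_map<uint64_t, std::string> store{{7, std::string(4, '\x01')}};
  std::vector<uint8_t> values;
  std::vector<uint32_t> missing;
  MockGet(store, /*value_size=*/4, {7, 42}, &values, &missing);  // key 42 is missing
  return missing.size() == 1 ? 0 : 1;
}

The Put path is the mirror image: it copies keys and values into the host buffers, synchronizes the stream, and overwrites store_[key] with the value bytes.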
-*/ -#include "oneflow/core/embedding/persistent_table_key_value_store.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/embedding/persistent_table.h" -#include -#include -#include -#include -#include - -namespace oneflow { - -namespace embedding { - -namespace { - -class IteratorImpl : public KVIterator { - public: - OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); - IteratorImpl(PersistentTable::Iterator* base_iter, uint32_t key_size, uint32_t value_size, - uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, - uint32_t* host_num_buffer) - : base_iter_(base_iter), - key_size_(key_size), - value_size_(value_size), - max_query_length_(max_query_length), - host_keys_buffer_(host_keys_buffer), - host_values_buffer_(host_values_buffer), - host_num_buffer_(host_num_buffer) {} - ~IteratorImpl() override = default; - - void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, - void* values) override { - CHECK_LE(n_request, max_query_length_); - auto cuda_stream = stream->As(); - CHECK_JUST(cuda_stream->Sync()); - base_iter_->Next(n_request, host_num_buffer_, host_keys_buffer_, host_values_buffer_); - OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - const uint32_t num_keys = *host_num_buffer_; - if (num_keys != 0) { - OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - } - } - - void Reset() override { base_iter_->Reset(); } - - private: - PersistentTable::Iterator* base_iter_; - uint32_t key_size_; - uint32_t value_size_; - uint32_t max_query_length_; - void* host_keys_buffer_; - void* host_values_buffer_; - uint32_t* host_num_buffer_; -}; - -template -class KeyValueStoreImpl : public KeyValueStore { - public: - OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); - explicit KeyValueStoreImpl(const PersistentTableKeyValueStoreOptions& options) - : device_index_(-1), max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - key_size_ = options.table_options.key_size; - value_size_ = options.table_options.value_size; - table_ = NewPersistentTable(options.table_options); - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_query_values_), - value_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), - sizeof(uint32_t))); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * max_query_length_)); - } - ~KeyValueStoreImpl() { - CudaCurrentDeviceGuard guard(device_index_); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(hipHostFree(host_n_missing_)); - } - - uint32_t KeySize() const override { return key_size_; } - - uint32_t ValueSize() const override { return value_size_; } - - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if 
(max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * query_length)); - max_query_length_ = query_length; - } - - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint32_t* n_missing, uint32_t* missing_indices) override; - void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; - bool SnapshotExists(const std::string& name) override; - void LoadSnapshot(const std::string& name) override; - void LoadSnapshot(const std::string& name, - const std::function& Hook) override; - void SaveSnapshot(const std::string& name) override; - - private: - int device_index_; - uint32_t max_query_length_; - uint32_t key_size_; - uint32_t value_size_; - Key* host_query_keys_{}; - uint8_t* host_query_values_{}; - uint32_t* host_n_missing_{}; - uint32_t* host_missing_indices_{}; - - std::mutex mutex_; - std::unique_ptr table_; -}; - -template -void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint32_t* n_missing, uint32_t* missing_indices) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - return; - } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - - table_->Get(num_keys, host_query_keys_, host_query_values_, host_n_missing_, - host_missing_indices_); - - OF_CUDA_CHECK(hipMemcpyAsync(values, host_query_values_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, - (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); -} - -template -void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, - const void* values) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { return; } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - table_->Put(num_keys, host_query_keys_, host_query_values_); -} - -template -bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { - return table_->SnapshotExists(name); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - LoadSnapshot(name, nullptr); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name, - const std::function& Hook) { - CudaCurrentDeviceGuard guard(device_index_); - if 
(Hook) { - table_->LoadSnapshot(name, [&](PersistentTable::Iterator* chunk_iterator) { - IteratorImpl iterator(chunk_iterator, KeySize(), ValueSize(), max_query_length_, - host_query_keys_, host_query_values_, host_n_missing_); - Hook(&iterator); - }); - } else { - table_->LoadSnapshot(name); - } -} - -template -void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - table_->SaveSnapshot(name); -} - -} // namespace - -std::unique_ptr NewPersistentTableKeyValueStore( - const PersistentTableKeyValueStoreOptions& options) { - if (options.table_options.key_size == sizeof(uint64_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else if (options.table_options.key_size == sizeof(uint32_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/embedding/persistent_table_key_value_store.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/embedding/persistent_table.h" +#include +#include +#include +#include +#include + +namespace oneflow { + +namespace embedding { + +namespace { + +class IteratorImpl : public KVIterator { + public: + OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); + IteratorImpl(PersistentTable::Iterator* base_iter, uint32_t key_size, uint32_t value_size, + uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, + uint32_t* host_num_buffer) + : base_iter_(base_iter), + key_size_(key_size), + value_size_(value_size), + max_query_length_(max_query_length), + host_keys_buffer_(host_keys_buffer), + host_values_buffer_(host_values_buffer), + host_num_buffer_(host_num_buffer) {} + ~IteratorImpl() override = default; + + void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, + void* values) override { + CHECK_LE(n_request, max_query_length_); + auto cuda_stream = stream->As(); + CHECK_JUST(cuda_stream->Sync()); + base_iter_->Next(n_request, host_num_buffer_, host_keys_buffer_, host_values_buffer_); + OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + const uint32_t num_keys = *host_num_buffer_; + if (num_keys != 0) { + OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + } + } + + void Reset() override { base_iter_->Reset(); } + + private: + PersistentTable::Iterator* base_iter_; + uint32_t key_size_; + uint32_t value_size_; + uint32_t max_query_length_; + void* host_keys_buffer_; + void* host_values_buffer_; + uint32_t* host_num_buffer_; +}; + +template +class KeyValueStoreImpl : public KeyValueStore { + public: + OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); + explicit 
KeyValueStoreImpl(const PersistentTableKeyValueStoreOptions& options) + : device_index_(-1), max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + key_size_ = options.table_options.key_size; + value_size_ = options.table_options.value_size; + table_ = NewPersistentTable(options.table_options); + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_query_values_), + value_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), + sizeof(uint32_t))); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * max_query_length_)); + } + ~KeyValueStoreImpl() { + CudaCurrentDeviceGuard guard(device_index_); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(hipHostFree(host_n_missing_)); + } + + uint32_t KeySize() const override { return key_size_; } + + uint32_t ValueSize() const override { return value_size_; } + + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { return; } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * query_length)); + max_query_length_ = query_length; + } + + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint32_t* n_missing, uint32_t* missing_indices) override; + void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; + bool SnapshotExists(const std::string& name) override; + void LoadSnapshot(const std::string& name) override; + void LoadSnapshot(const std::string& name, + const std::function& Hook) override; + void SaveSnapshot(const std::string& name) override; + + private: + int device_index_; + uint32_t max_query_length_; + uint32_t key_size_; + uint32_t value_size_; + Key* host_query_keys_{}; + uint8_t* host_query_values_{}; + uint32_t* host_n_missing_{}; + uint32_t* host_missing_indices_{}; + + std::mutex mutex_; + std::unique_ptr table_; +}; + +template +void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint32_t* n_missing, uint32_t* missing_indices) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + return; + } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + + table_->Get(num_keys, host_query_keys_, host_query_values_, host_n_missing_, + 
host_missing_indices_); + + OF_CUDA_CHECK(hipMemcpyAsync(values, host_query_values_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, + (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); +} + +template +void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, + const void* values) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { return; } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + table_->Put(num_keys, host_query_keys_, host_query_values_); +} + +template +bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { + return table_->SnapshotExists(name); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + LoadSnapshot(name, nullptr); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name, + const std::function& Hook) { + CudaCurrentDeviceGuard guard(device_index_); + if (Hook) { + table_->LoadSnapshot(name, [&](PersistentTable::Iterator* chunk_iterator) { + IteratorImpl iterator(chunk_iterator, KeySize(), ValueSize(), max_query_length_, + host_query_keys_, host_query_values_, host_n_missing_); + Hook(&iterator); + }); + } else { + table_->LoadSnapshot(name); + } +} + +template +void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + table_->SaveSnapshot(name); +} + +} // namespace + +std::unique_ptr NewPersistentTableKeyValueStore( + const PersistentTableKeyValueStoreOptions& options) { + if (options.table_options.key_size == sizeof(uint64_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else if (options.table_options.key_size == sizeof(uint32_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/cuda_device.cpp b/oneflow/core/ep/rocm/cuda_device.cpp index 850a490..d0dae8a 100644 --- a/oneflow/core/ep/rocm/cuda_device.cpp +++ b/oneflow/core/ep/rocm/cuda_device.cpp @@ -1,179 +1,179 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
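Both mock_key_value_store.hip.cpp and persistent_table_key_value_store.hip.cpp follow the same staging pattern in Get: copy the queried keys from device memory into a pinned host buffer, synchronize the stream, run the host-side lookup, then enqueue the results (values, n_missing, missing_indices) back to the device on the same stream. The sketch below condenses that pattern under simplifying assumptions: StagedGet and LookupOnHost are illustrative names, the key type is fixed to uint64_t, and error handling is reduced to abort().

// Sketch (illustrative): device -> pinned host staging, synchronous host lookup,
// then async copy of results back to the device on the caller's stream.
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdlib>
#include <functional>

#define CHECK_HIP(expr)                                  \
  do {                                                   \
    if ((expr) != hipSuccess) { std::abort(); }          \
  } while (0)

void StagedGet(hipStream_t stream, uint32_t num_keys, const uint64_t* device_keys,
               uint8_t* device_values, uint32_t value_size,
               uint64_t* host_keys /*pinned*/, uint8_t* host_values /*pinned*/,
               const std::function<void(uint32_t, const uint64_t*, uint8_t*)>& LookupOnHost) {
  // 1) Stage the device keys into pinned host memory and wait for the copy to land.
  CHECK_HIP(hipMemcpyAsync(host_keys, device_keys, num_keys * sizeof(uint64_t),
                           hipMemcpyDefault, stream));
  CHECK_HIP(hipStreamSynchronize(stream));
  // 2) Run the synchronous host-side lookup into the pinned value buffer.
  LookupOnHost(num_keys, host_keys, host_values);
  // 3) Enqueue the results back to device memory on the same stream.
  CHECK_HIP(hipMemcpyAsync(device_values, host_values, num_keys * value_size,
                           hipMemcpyDefault, stream));
}

Pinned host buffers (hipMallocHost or NumaAwareCudaMallocHost in the patch) keep both transfers DMA-capable, and the std::mutex in the real stores serializes concurrent callers because they all share one staging buffer.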
-*/ -#include "oneflow/core/ep/rocm/cuda_device.h" -#include "oneflow/core/ep/rocm/cuda_event.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#ifdef WITH_ROCM - -#include -#include - -// #if CUDA_VERSION >= 11000 -// #include -// #endif - -namespace oneflow { - -namespace ep { - -namespace { - -constexpr size_t kDefaultConstBufElementCount = 1024 * 1024; - -template -void CreateConstBuffer(void** buf, T value, size_t n) { - OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T))); - std::vector host(n, value); - OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault)); -} - -} // namespace - -CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager) - : device_index_(device_index), - event_flags_{}, - properties_{}, - device_manager_(device_manager), - const_buf_elem_cnt_(0), - const_zeros_buffer_(nullptr), - const_ones_buffer_fp32_(nullptr), - const_ones_buffer_fp16_(nullptr), - const_ones_buffer_bf16_(nullptr) { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_)); - event_flags_ = hipEventDisableTiming; - if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) { - event_flags_ |= hipEventBlockingSync; - } - const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT", - kDefaultConstBufElementCount); - if (const_buf_elem_cnt_ > 0) { - CreateConstBuffer(&const_zeros_buffer_, static_cast(0), const_buf_elem_cnt_); - CreateConstBuffer(&const_ones_buffer_fp32_, static_cast(1.0), - const_buf_elem_cnt_); - CreateConstBuffer(&const_ones_buffer_fp16_, static_cast(1.0), const_buf_elem_cnt_); -// #if CUDA_VERSION >= 11000 -// CreateConstBuffer(&const_ones_buffer_bf16_, static_cast(1.0), -// const_buf_elem_cnt_); -// #endif - } -} - -CudaDevice::~CudaDevice() { - CudaCurrentDeviceGuard guard(device_index_); - for (auto* event : events_) { delete event; } - OF_CUDA_CHECK(hipFree(const_zeros_buffer_)); - OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_)); - OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_)); - OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_)); -} - -void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); } - -Stream* CudaDevice::CreateStream() { - CudaCurrentDeviceGuard guard(device_index_); - return new CudaStream(this); -} - -void CudaDevice::DestroyStream(Stream* stream) { - CudaCurrentDeviceGuard guard(device_index_); - delete stream; -} - -void CudaDevice::CreateEvents(Event** events, size_t count) { - size_t copied = 0; - { - std::lock_guard lock(events_mutex_); - copied = std::min(count, events_.size()); - size_t offset = events_.size() - copied; - std::copy(events_.begin() + offset, events_.end(), events); - events_.resize(offset); - } - if (copied != count) { - CudaCurrentDeviceGuard guard(device_index_); - for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); } - } -} - -void CudaDevice::DestroyEvents(Event** events, size_t count) { - std::lock_guard lock(events_mutex_); - events_.insert(events_.end(), events, events + count); -} - -Maybe CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) { - CudaCurrentDeviceGuard guard(device_index_); - CHECK(!options.HasPinnedDevice()); - hipError_t err = hipMalloc(ptr, size); - if (err != hipSuccess) { - return Error::RuntimeError() << hipGetErrorString(err); - } else { - return Maybe::Ok(); - } -} - -void CudaDevice::Free(const AllocationOptions& attr, void* ptr) { - CudaCurrentDeviceGuard guard(device_index_); - 
OF_CUDA_CHECK(hipFree(ptr)); -} - -Maybe CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) { - CudaCurrentDeviceGuard guard(device_index_); - hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size); - if (err != hipSuccess) { - return Error::RuntimeError() << hipGetErrorString(err); - } else { - return Maybe::Ok(); - } -} - -void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(ptr)); -} - -const hipDeviceProp_t& CudaDevice::properties() const { return properties_; } - -const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const { - if (GetSizeOfDataType(data_type) * n - <= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) { - return const_zeros_buffer_; - } else { - return nullptr; - } -} - -const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const { - if (n <= const_buf_elem_cnt_) { - if (data_type == DataType::kFloat) { - return const_ones_buffer_fp32_; - } else if (data_type == DataType::kFloat16) { - return const_ones_buffer_fp16_; - } else if (data_type == DataType::kBFloat16) { - return const_ones_buffer_bf16_; - } else { - return nullptr; - } - } else { - return nullptr; - } -} - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
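For context on the constant buffers above: CreateConstBuffer allocates a device array, fills it from a host std::vector with a single hipMemcpy, and GetConstZeros/GetConstOnes later hand that array out whenever the request fits inside the preallocated element count (a ones vector of this kind is commonly used as one operand of GEMM-based reductions). A self-contained sketch under that reading; MakeConstBuffer is a made-up name, not an API introduced by this patch.

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

// Stand-in for CreateConstBuffer: a device buffer holding n copies of `value`.
template<typename T>
T* MakeConstBuffer(T value, size_t n) {
  T* device_buf = nullptr;
  if (hipMalloc(reinterpret_cast<void**>(&device_buf), n * sizeof(T)) != hipSuccess) {
    return nullptr;
  }
  std::vector<T> host(n, value);
  if (hipMemcpy(device_buf, host.data(), n * sizeof(T), hipMemcpyHostToDevice) != hipSuccess) {
    (void)hipFree(device_buf);
    return nullptr;
  }
  return device_buf;
}

int main() {
  constexpr size_t kElemCnt = 1024 * 1024;  // mirrors kDefaultConstBufElementCount
  float* ones_fp32 = MakeConstBuffer<float>(1.0f, kElemCnt);
  std::printf("const ones buffer at %p\n", static_cast<void*>(ones_fp32));
  (void)hipFree(ones_fp32);
  return 0;
}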
+*/ +#include "oneflow/core/ep/rocm/cuda_device.h" +#include "oneflow/core/ep/rocm/cuda_event.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#ifdef WITH_ROCM + +#include +#include + +// #if CUDA_VERSION >= 11000 +// #include +// #endif + +namespace oneflow { + +namespace ep { + +namespace { + +constexpr size_t kDefaultConstBufElementCount = 1024 * 1024; + +template +void CreateConstBuffer(void** buf, T value, size_t n) { + OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T))); + std::vector host(n, value); + OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault)); +} + +} // namespace + +CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager) + : device_index_(device_index), + event_flags_{}, + properties_{}, + device_manager_(device_manager), + const_buf_elem_cnt_(0), + const_zeros_buffer_(nullptr), + const_ones_buffer_fp32_(nullptr), + const_ones_buffer_fp16_(nullptr), + const_ones_buffer_bf16_(nullptr) { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_)); + event_flags_ = hipEventDisableTiming; + if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) { + event_flags_ |= hipEventBlockingSync; + } + const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT", + kDefaultConstBufElementCount); + if (const_buf_elem_cnt_ > 0) { + CreateConstBuffer(&const_zeros_buffer_, static_cast(0), const_buf_elem_cnt_); + CreateConstBuffer(&const_ones_buffer_fp32_, static_cast(1.0), + const_buf_elem_cnt_); + CreateConstBuffer(&const_ones_buffer_fp16_, static_cast(1.0), const_buf_elem_cnt_); +// #if CUDA_VERSION >= 11000 +// CreateConstBuffer(&const_ones_buffer_bf16_, static_cast(1.0), +// const_buf_elem_cnt_); +// #endif + } +} + +CudaDevice::~CudaDevice() { + CudaCurrentDeviceGuard guard(device_index_); + for (auto* event : events_) { delete event; } + OF_CUDA_CHECK(hipFree(const_zeros_buffer_)); + OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_)); + OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_)); + OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_)); +} + +void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); } + +Stream* CudaDevice::CreateStream() { + CudaCurrentDeviceGuard guard(device_index_); + return new CudaStream(this); +} + +void CudaDevice::DestroyStream(Stream* stream) { + CudaCurrentDeviceGuard guard(device_index_); + delete stream; +} + +void CudaDevice::CreateEvents(Event** events, size_t count) { + size_t copied = 0; + { + std::lock_guard lock(events_mutex_); + copied = std::min(count, events_.size()); + size_t offset = events_.size() - copied; + std::copy(events_.begin() + offset, events_.end(), events); + events_.resize(offset); + } + if (copied != count) { + CudaCurrentDeviceGuard guard(device_index_); + for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); } + } +} + +void CudaDevice::DestroyEvents(Event** events, size_t count) { + std::lock_guard lock(events_mutex_); + events_.insert(events_.end(), events, events + count); +} + +Maybe CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) { + CudaCurrentDeviceGuard guard(device_index_); + CHECK(!options.HasPinnedDevice()); + hipError_t err = hipMalloc(ptr, size); + if (err != hipSuccess) { + return Error::RuntimeError() << hipGetErrorString(err); + } else { + return Maybe::Ok(); + } +} + +void CudaDevice::Free(const AllocationOptions& attr, void* ptr) { + CudaCurrentDeviceGuard guard(device_index_); + 
OF_CUDA_CHECK(hipFree(ptr)); +} + +Maybe CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) { + CudaCurrentDeviceGuard guard(device_index_); + hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size); + if (err != hipSuccess) { + return Error::RuntimeError() << hipGetErrorString(err); + } else { + return Maybe::Ok(); + } +} + +void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipHostFree(ptr)); +} + +const hipDeviceProp_t& CudaDevice::properties() const { return properties_; } + +const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const { + if (GetSizeOfDataType(data_type) * n + <= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) { + return const_zeros_buffer_; + } else { + return nullptr; + } +} + +const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const { + if (n <= const_buf_elem_cnt_) { + if (data_type == DataType::kFloat) { + return const_ones_buffer_fp32_; + } else if (data_type == DataType::kFloat16) { + return const_ones_buffer_fp16_; + } else if (data_type == DataType::kBFloat16) { + return const_ones_buffer_bf16_; + } else { + return nullptr; + } + } else { + return nullptr; + } +} + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_device.h b/oneflow/core/ep/rocm/cuda_device.h index 76e1015..1623b8e 100644 --- a/oneflow/core/ep/rocm/cuda_device.h +++ b/oneflow/core/ep/rocm/cuda_device.h @@ -1,78 +1,78 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
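The CreateEvents/DestroyEvents pair above is a small recycling pool: destroyed events are parked in a vector behind a mutex and handed back before any new hipEvent_t is created, which keeps event construction off the hot path. A stripped-down sketch of the same idea; EventPool is a hypothetical class, not OneFlow's API.

#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstddef>
#include <mutex>
#include <vector>

class EventPool {
 public:
  // Hand back recycled events first, then create the remainder (the flag mirrors the
  // default hipEventDisableTiming used by CudaDevice).
  void Get(hipEvent_t* events, size_t count) {
    size_t reused = 0;
    {
      std::lock_guard<std::mutex> lock(mutex_);
      reused = std::min(count, pool_.size());
      std::copy(pool_.end() - static_cast<std::ptrdiff_t>(reused), pool_.end(), events);
      pool_.resize(pool_.size() - reused);
    }
    for (size_t i = reused; i < count; ++i) {
      (void)hipEventCreateWithFlags(&events[i], hipEventDisableTiming);
    }
  }
  // Returning events just parks them for later reuse.
  void Put(hipEvent_t* events, size_t count) {
    std::lock_guard<std::mutex> lock(mutex_);
    pool_.insert(pool_.end(), events, events + count);
  }

 private:
  std::mutex mutex_;
  std::vector<hipEvent_t> pool_;
};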
-*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ - -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/common/data_type.h" - -#ifdef WITH_ROCM - -#include - -namespace oneflow { - -namespace ep { - -class CudaDevice : public Device { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaDevice); - explicit CudaDevice(int device_index, DeviceManager* device_manager); - ~CudaDevice() override; - - void SetAsActiveDevice() override; - - DeviceType device_type() const override { return DeviceType::kCUDA; } - size_t device_index() const override { return device_index_; } - DeviceManager* device_manager() const override { return device_manager_; } - - Stream* CreateStream() override; - void DestroyStream(Stream* stream) override; - - void CreateEvents(Event** events, size_t count) override; - void DestroyEvents(Event** events, size_t count) override; - - Maybe Alloc(const AllocationOptions& options, void** ptr, size_t size) override; - void Free(const AllocationOptions& options, void* ptr) override; - Maybe AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override; - void FreePinned(const AllocationOptions& options, void* ptr) override; - - const hipDeviceProp_t& properties() const; - - const void* GetConstZeros(DataType data_type, size_t n) const; - const void* GetConstOnes(DataType data_type, size_t n) const; - - private: - int device_index_; - std::mutex events_mutex_; - std::vector events_; - unsigned int event_flags_; - hipDeviceProp_t properties_; - DeviceManager* device_manager_; - int64_t const_buf_elem_cnt_; - void* const_zeros_buffer_; - void* const_ones_buffer_fp32_; - void* const_ones_buffer_fp16_; - void* const_ones_buffer_bf16_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ + +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/common/data_type.h" + +#ifdef WITH_ROCM + +#include + +namespace oneflow { + +namespace ep { + +class CudaDevice : public Device { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaDevice); + explicit CudaDevice(int device_index, DeviceManager* device_manager); + ~CudaDevice() override; + + void SetAsActiveDevice() override; + + DeviceType device_type() const override { return DeviceType::kCUDA; } + size_t device_index() const override { return device_index_; } + DeviceManager* device_manager() const override { return device_manager_; } + + Stream* CreateStream() override; + void DestroyStream(Stream* stream) override; + + void CreateEvents(Event** events, size_t count) override; + void DestroyEvents(Event** events, size_t count) override; + + Maybe Alloc(const AllocationOptions& options, void** ptr, size_t size) override; + void Free(const AllocationOptions& options, void* ptr) override; + Maybe AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override; + void FreePinned(const AllocationOptions& options, void* ptr) override; + + const hipDeviceProp_t& properties() const; + + const void* GetConstZeros(DataType data_type, size_t n) const; + const void* GetConstOnes(DataType data_type, size_t n) const; + + private: + int device_index_; + std::mutex events_mutex_; + std::vector events_; + unsigned int event_flags_; + hipDeviceProp_t properties_; + DeviceManager* device_manager_; + int64_t const_buf_elem_cnt_; + void* const_zeros_buffer_; + void* const_ones_buffer_fp32_; + void* const_ones_buffer_fp16_; + void* const_ones_buffer_bf16_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ diff --git a/oneflow/core/ep/rocm/cuda_device_manager.cpp b/oneflow/core/ep/rocm/cuda_device_manager.cpp index 48664df..6ea769f 100644 --- a/oneflow/core/ep/rocm/cuda_device_manager.cpp +++ b/oneflow/core/ep/rocm/cuda_device_manager.cpp @@ -1,68 +1,68 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/rocm/cuda_device_manager.h" -#include "oneflow/core/device/cuda_util.h" - -#ifdef WITH_ROCM - -namespace oneflow { - -namespace ep { - -CudaDeviceManager::CudaDeviceManager(DeviceManagerRegistry* registry) : registry_(registry) {} -CudaDeviceManager::~CudaDeviceManager() = default; - -DeviceManagerRegistry* CudaDeviceManager::registry() const { return registry_; } - -std::shared_ptr CudaDeviceManager::GetDevice(size_t device_index) { - std::lock_guard lock(devices_mutex_); - if (device_index < devices_.size() && devices_.at(device_index)) { - return devices_.at(device_index); - } - auto device = std::make_shared(device_index, this); - if (device_index >= devices_.size()) { devices_.resize(device_index + 1); } - devices_.at(device_index) = device; - return device; -} - -size_t CudaDeviceManager::GetDeviceCount(size_t primary_device_index) { - CudaCurrentDeviceGuard guard(primary_device_index); - return this->GetDeviceCount(); -} - -size_t CudaDeviceManager::GetDeviceCount() { - int count = 0; - hipError_t err = hipGetDeviceCount(&count); - if (err == hipErrorNoDevice || err == hipErrorInsufficientDriver) { return 0; } - OF_CUDA_CHECK(err); - return count; -} - -size_t CudaDeviceManager::GetActiveDeviceIndex() { - int device = 0; - OF_CUDA_CHECK(hipGetDevice(&device)); - return static_cast(device); -} - -void CudaDeviceManager::SetActiveDeviceByIndex(size_t device_index) { - OF_CUDA_CHECK(hipSetDevice(static_cast(device_index))); -} - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/cuda_device_manager.h" +#include "oneflow/core/device/cuda_util.h" + +#ifdef WITH_ROCM + +namespace oneflow { + +namespace ep { + +CudaDeviceManager::CudaDeviceManager(DeviceManagerRegistry* registry) : registry_(registry) {} +CudaDeviceManager::~CudaDeviceManager() = default; + +DeviceManagerRegistry* CudaDeviceManager::registry() const { return registry_; } + +std::shared_ptr CudaDeviceManager::GetDevice(size_t device_index) { + std::lock_guard lock(devices_mutex_); + if (device_index < devices_.size() && devices_.at(device_index)) { + return devices_.at(device_index); + } + auto device = std::make_shared(device_index, this); + if (device_index >= devices_.size()) { devices_.resize(device_index + 1); } + devices_.at(device_index) = device; + return device; +} + +size_t CudaDeviceManager::GetDeviceCount(size_t primary_device_index) { + CudaCurrentDeviceGuard guard(primary_device_index); + return this->GetDeviceCount(); +} + +size_t CudaDeviceManager::GetDeviceCount() { + int count = 0; + hipError_t err = hipGetDeviceCount(&count); + if (err == hipErrorNoDevice || err == hipErrorInsufficientDriver) { return 0; } + OF_CUDA_CHECK(err); + return count; +} + +size_t CudaDeviceManager::GetActiveDeviceIndex() { + int device = 0; + OF_CUDA_CHECK(hipGetDevice(&device)); + return static_cast(device); +} + +void CudaDeviceManager::SetActiveDeviceByIndex(size_t device_index) { + OF_CUDA_CHECK(hipSetDevice(static_cast(device_index))); +} + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_device_manager.h b/oneflow/core/ep/rocm/cuda_device_manager.h index e1b9488..22a9fc8 100644 --- a/oneflow/core/ep/rocm/cuda_device_manager.h +++ b/oneflow/core/ep/rocm/cuda_device_manager.h @@ -1,54 +1,54 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ - -#include "oneflow/core/ep/include/device_manager.h" -#include "oneflow/core/ep/rocm/cuda_device.h" - -#ifdef WITH_ROCM - -namespace oneflow { -namespace ep { - -class CudaDevice; - -class CudaDeviceManager : public DeviceManager { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManager); - CudaDeviceManager(DeviceManagerRegistry* registry); - ~CudaDeviceManager() override; - - DeviceManagerRegistry* registry() const override; - std::shared_ptr GetDevice(size_t device_index) override; - size_t GetDeviceCount(size_t primary_device_index) override; - size_t GetDeviceCount() override; - size_t GetActiveDeviceIndex() override; - void SetActiveDeviceByIndex(size_t device_index) override; - - private: - std::mutex devices_mutex_; - std::vector> devices_; - DeviceManagerRegistry* registry_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ + +#include "oneflow/core/ep/include/device_manager.h" +#include "oneflow/core/ep/rocm/cuda_device.h" + +#ifdef WITH_ROCM + +namespace oneflow { +namespace ep { + +class CudaDevice; + +class CudaDeviceManager : public DeviceManager { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManager); + CudaDeviceManager(DeviceManagerRegistry* registry); + ~CudaDeviceManager() override; + + DeviceManagerRegistry* registry() const override; + std::shared_ptr GetDevice(size_t device_index) override; + size_t GetDeviceCount(size_t primary_device_index) override; + size_t GetDeviceCount() override; + size_t GetActiveDeviceIndex() override; + void SetActiveDeviceByIndex(size_t device_index) override; + + private: + std::mutex devices_mutex_; + std::vector> devices_; + DeviceManagerRegistry* registry_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ diff --git a/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp b/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp index fb8d15c..6b559fe 100644 --- a/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp +++ b/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp @@ -1,117 +1,117 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/include/device_manager_factory.h" -#include "oneflow/core/ep/include/device_manager_registry.h" -#include "oneflow/core/ep/rocm/cuda_device_manager.h" - -#ifdef WITH_ROCM - -#include -#include -#include - -namespace oneflow { - -namespace ep { - -namespace { - -std::string GetCudaVersionString(int version) { - return std::to_string(version / 1000) + "." + std::to_string((version % 1000) / 10); -} - -bool GetCudnnVersion(size_t* major, size_t* minor, size_t* patch) { - miopenStatus_t status = miopenGetVersion(major, minor, patch); - if (status == miopenStatusSuccess) { - return true; - } else { - LOG(ERROR) << "Failed to get cuDNN version: " << miopenGetErrorString(status); - return false; - } -} - -bool GetCudnnVersionString(std::string* version) { - size_t version_major = 0; - size_t version_minor = 0; - size_t version_patch = 0; - if (!GetCudnnVersion(&version_major, &version_minor, &version_patch)) { return false; } - *version = std::to_string(version_major) + "." + std::to_string(version_minor) + "." 
- + std::to_string(version_patch); - return true; -} - -void CudaDumpVersionInfo() { - { - int cuda_runtime_version = 0; - hipError_t err = hipRuntimeGetVersion(&cuda_runtime_version); - if (err == hipSuccess) { - LOG(INFO) << "CUDA runtime version: " << GetCudaVersionString(cuda_runtime_version); - } else { - LOG(ERROR) << "Failed to get cuda runtime version: " << hipGetErrorString(err); - } - } - - { - std::string cudnn_version_string; - if (GetCudnnVersionString(&cudnn_version_string)) { - LOG(INFO) << "cuDNN version: " << cudnn_version_string; - } - } - - { - int nccl_version = 0; - ncclResult_t result = ncclGetVersion(&nccl_version); - if (result == ncclSuccess) { - int nccl_version_major = - (nccl_version >= 20900) ? (nccl_version / 10000) : (nccl_version / 1000); - int nccl_version_minor = - (nccl_version >= 20900) ? (nccl_version % 10000) / 100 : (nccl_version % 1000) / 100; - int nccl_version_patch = (nccl_version % 100); - LOG(INFO) << "NCCL version: " << nccl_version_major << "." << nccl_version_minor << "." - << nccl_version_patch; - } else { - LOG(ERROR) << "Failed to get NCCL version: " << ncclGetErrorString(result); - } - } -} - -class CudaDeviceManagerFactory : public DeviceManagerFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManagerFactory); - CudaDeviceManagerFactory() = default; - ~CudaDeviceManagerFactory() override = default; - - std::unique_ptr NewDeviceManager(DeviceManagerRegistry* registry) override { - return std::make_unique(registry); - } - - DeviceType device_type() const override { return DeviceType::kCUDA; } - - std::string device_type_name() const override { return "cuda"; } - - void DumpVersionInfo() const override { CudaDumpVersionInfo(); } -}; - -COMMAND(DeviceManagerRegistry::RegisterDeviceManagerFactory( - std::make_unique())) - -} // namespace - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/device_manager_factory.h" +#include "oneflow/core/ep/include/device_manager_registry.h" +#include "oneflow/core/ep/rocm/cuda_device_manager.h" + +#ifdef WITH_ROCM + +#include +#include +#include + +namespace oneflow { + +namespace ep { + +namespace { + +std::string GetCudaVersionString(int version) { + return std::to_string(version / 1000) + "." + std::to_string((version % 1000) / 10); +} + +bool GetCudnnVersion(size_t* major, size_t* minor, size_t* patch) { + miopenStatus_t status = miopenGetVersion(major, minor, patch); + if (status == miopenStatusSuccess) { + return true; + } else { + LOG(ERROR) << "Failed to get cuDNN version: " << miopenGetErrorString(status); + return false; + } +} + +bool GetCudnnVersionString(std::string* version) { + size_t version_major = 0; + size_t version_minor = 0; + size_t version_patch = 0; + if (!GetCudnnVersion(&version_major, &version_minor, &version_patch)) { return false; } + *version = std::to_string(version_major) + "." + std::to_string(version_minor) + "." 
+ + std::to_string(version_patch); + return true; +} + +void CudaDumpVersionInfo() { + { + int cuda_runtime_version = 0; + hipError_t err = hipRuntimeGetVersion(&cuda_runtime_version); + if (err == hipSuccess) { + LOG(INFO) << "CUDA runtime version: " << GetCudaVersionString(cuda_runtime_version); + } else { + LOG(ERROR) << "Failed to get cuda runtime version: " << hipGetErrorString(err); + } + } + + { + std::string cudnn_version_string; + if (GetCudnnVersionString(&cudnn_version_string)) { + LOG(INFO) << "cuDNN version: " << cudnn_version_string; + } + } + + { + int nccl_version = 0; + ncclResult_t result = ncclGetVersion(&nccl_version); + if (result == ncclSuccess) { + int nccl_version_major = + (nccl_version >= 20900) ? (nccl_version / 10000) : (nccl_version / 1000); + int nccl_version_minor = + (nccl_version >= 20900) ? (nccl_version % 10000) / 100 : (nccl_version % 1000) / 100; + int nccl_version_patch = (nccl_version % 100); + LOG(INFO) << "NCCL version: " << nccl_version_major << "." << nccl_version_minor << "." + << nccl_version_patch; + } else { + LOG(ERROR) << "Failed to get NCCL version: " << ncclGetErrorString(result); + } + } +} + +class CudaDeviceManagerFactory : public DeviceManagerFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManagerFactory); + CudaDeviceManagerFactory() = default; + ~CudaDeviceManagerFactory() override = default; + + std::unique_ptr NewDeviceManager(DeviceManagerRegistry* registry) override { + return std::make_unique(registry); + } + + DeviceType device_type() const override { return DeviceType::kCUDA; } + + std::string device_type_name() const override { return "cuda"; } + + void DumpVersionInfo() const override { CudaDumpVersionInfo(); } +}; + +COMMAND(DeviceManagerRegistry::RegisterDeviceManagerFactory( + std::make_unique())) + +} // namespace + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_event.cpp b/oneflow/core/ep/rocm/cuda_event.cpp index 20ce0f0..011adef 100644 --- a/oneflow/core/ep/rocm/cuda_event.cpp +++ b/oneflow/core/ep/rocm/cuda_event.cpp @@ -1,56 +1,56 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
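CudaDumpVersionInfo above decodes the packed integer returned by ncclGetVersion (RCCL exposes the same entry point): releases from 2.9 onward are encoded as major*10000 + minor*100 + patch, while older releases used major*1000 + minor*100 + patch, hence the 20900 threshold. A tiny standalone check of that arithmetic:

#include <cstdio>

// Decode a packed (N/R)CCL version the way CudaDumpVersionInfo does.
void PrintCclVersion(int v) {
  const int major = (v >= 20900) ? v / 10000 : v / 1000;
  const int minor = (v >= 20900) ? (v % 10000) / 100 : (v % 1000) / 100;
  const int patch = v % 100;
  std::printf("version %d.%d.%d\n", major, minor, patch);
}

int main() {
  PrintCclVersion(21205);  // 2.12.5 under the post-2.9 encoding
  PrintCclVersion(2708);   // 2.7.8 under the old encoding
  return 0;
}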
-*/ -#include "oneflow/core/ep/rocm/cuda_event.h" - -#ifdef WITH_ROCM - -namespace oneflow { - -namespace ep { - -CudaEvent::CudaEvent(unsigned int flags) : cuda_event_{} { - OF_CUDA_CHECK(hipEventCreateWithFlags(&cuda_event_, flags)); -} - -CudaEvent::~CudaEvent() { OF_CUDA_CHECK(hipEventDestroy(cuda_event_)); } - -Maybe CudaEvent::QueryDone() { - hipError_t err = hipEventQuery(cuda_event_); - if (err == hipSuccess) { - return Maybe(true); - } else if (err == hipErrorNotReady) { - return Maybe(false); - } else { - return Error::RuntimeError() << hipGetErrorString(err); - } -} - -Maybe CudaEvent::Sync() { - hipError_t err = hipEventSynchronize(cuda_event_); - if (err == hipSuccess) { - return Maybe::Ok(); - } else { - return Error::RuntimeError() << hipGetErrorString(err); - } -} - -hipEvent_t CudaEvent::cuda_event() { return cuda_event_; } - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/rocm/cuda_event.h" + +#ifdef WITH_ROCM + +namespace oneflow { + +namespace ep { + +CudaEvent::CudaEvent(unsigned int flags) : cuda_event_{} { + OF_CUDA_CHECK(hipEventCreateWithFlags(&cuda_event_, flags)); +} + +CudaEvent::~CudaEvent() { OF_CUDA_CHECK(hipEventDestroy(cuda_event_)); } + +Maybe CudaEvent::QueryDone() { + hipError_t err = hipEventQuery(cuda_event_); + if (err == hipSuccess) { + return Maybe(true); + } else if (err == hipErrorNotReady) { + return Maybe(false); + } else { + return Error::RuntimeError() << hipGetErrorString(err); + } +} + +Maybe CudaEvent::Sync() { + hipError_t err = hipEventSynchronize(cuda_event_); + if (err == hipSuccess) { + return Maybe::Ok(); + } else { + return Error::RuntimeError() << hipGetErrorString(err); + } +} + +hipEvent_t CudaEvent::cuda_event() { return cuda_event_; } + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_event.h b/oneflow/core/ep/rocm/cuda_event.h index 37a3379..62caf75 100644 --- a/oneflow/core/ep/rocm/cuda_event.h +++ b/oneflow/core/ep/rocm/cuda_event.h @@ -1,50 +1,50 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ - -#include "oneflow/core/ep/include/event.h" - -#ifdef WITH_ROCM - -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace ep { - -class CudaEvent : public Event { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaEvent); - explicit CudaEvent(unsigned int flags); - ~CudaEvent() override; - - Maybe QueryDone() override; - Maybe Sync() override; - - hipEvent_t cuda_event(); - - private: - hipEvent_t cuda_event_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ + +#include "oneflow/core/ep/include/event.h" + +#ifdef WITH_ROCM + +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace ep { + +class CudaEvent : public Event { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaEvent); + explicit CudaEvent(unsigned int flags); + ~CudaEvent() override; + + Maybe QueryDone() override; + Maybe Sync() override; + + hipEvent_t cuda_event(); + + private: + hipEvent_t cuda_event_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ diff --git a/oneflow/core/ep/rocm/cuda_stream.cpp b/oneflow/core/ep/rocm/cuda_stream.cpp index 18f1870..1508ba0 100644 --- a/oneflow/core/ep/rocm/cuda_stream.cpp +++ b/oneflow/core/ep/rocm/cuda_stream.cpp @@ -1,180 +1,180 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/job/global_for.h" -#include "oneflow/core/job/resource_desc.h" -#include "oneflow/core/hardware/node_device_descriptor_manager.h" -#include "oneflow/core/hardware/cuda_device_descriptor.h" -#include "oneflow/core/ep/rocm/cuda_event.h" -#include "oneflow/core/ep/rocm/cuda_device.h" - -#ifdef WITH_ROCM - -namespace oneflow { - -namespace ep { - -namespace { - -constexpr size_t kDefaultWorkspaceSize = 4 * 1024 * 1024; // 4M - -void SetAffinityByDevice(int dev_id) { - auto node_device_desc_mgr = Singleton::Get(); - if (node_device_desc_mgr == nullptr) { return; } - auto node_device_desc = node_device_desc_mgr->GetLocalNodeDeviceDescriptor(); - auto cuda_device = std::dynamic_pointer_cast( - node_device_desc->GetDevice(hardware::kCudaDeviceDescriptorClassName, dev_id)); - if (!cuda_device) { return; } - node_device_desc->Topology()->SetCPUAffinityByPCIBusID(cuda_device->PCIBusID()); - node_device_desc->Topology()->SetMemoryAffinityByPCIBusID(cuda_device->PCIBusID()); -} - -} // namespace - -#ifdef WITH_ROCM_GRAPHS - -CudaGraphExecutable::CudaGraphExecutable() : graph_exec_(nullptr), dev_(-1) {} - -CudaGraphExecutable::~CudaGraphExecutable() { Reset(); } - -void CudaGraphExecutable::Update(hipGraph_t graph) { - int dev = -1; - OF_CUDA_CHECK(hipGetDevice(&dev)); - if (dev != dev_) { Reset(); } - dev_ = dev; - if (graph_exec_ != nullptr) { - hipGraphExecUpdateResult update_result{}; - hipGraphNode_t error_node = nullptr; - OF_CUDA_CHECK(hipGraphExecUpdate(graph_exec_, graph, &error_node, &update_result)); - if (update_result == hipGraphExecUpdateSuccess) { return; } - } - Reset(); - OF_CUDA_CHECK(hipGraphInstantiate(&graph_exec_, graph, NULL, NULL, 0)); -} - -void CudaGraphExecutable::Launch(hipStream_t stream) const { - OF_CUDA_CHECK(hipGraphLaunch(graph_exec_, stream)); -} - -bool CudaGraphExecutable::IsInstantiated() const { return graph_exec_ != nullptr; } - -void CudaGraphExecutable::Reset() { - if (graph_exec_ != nullptr) { - CudaCurrentDeviceGuard guard(dev_); - OF_CUDA_CHECK(hipGraphExecDestroy(graph_exec_)); - } -} - -#endif // WITH_ROCM_GRAPHS - -CudaStream::CudaStream(CudaDevice* device) - : device_index_(device->device_index()), device_(device) { - CudaCurrentDeviceGuard guard(device_index_); - // cuda_stream - OF_CUDA_CHECK(hipStreamCreate(&cuda_stream_)); - // cublas_handle - OF_CUBLAS_CHECK(hipblasCreate(&cublas_handle_)); - OF_CUBLAS_CHECK(hipblasSetStream(cublas_handle_, cuda_stream_)); - - workspace_size_ = kDefaultWorkspaceSize; - OF_CUDA_CHECK(hipMalloc(&workspace_, workspace_size_)); - - OF_CUDNN_CHECK(hipdnnCreate(&cudnn_handle_)); - - OF_CUDNN_CHECK(hipdnnSetStream(cudnn_handle_, cuda_stream_)); -} - -CudaStream::~CudaStream() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipStreamSynchronize(cuda_stream_)); - OF_CUDNN_CHECK(hipdnnDestroy(cudnn_handle_)); - OF_CUBLAS_CHECK(hipblasDestroy(cublas_handle_)); - - OF_CUDA_CHECK(hipStreamDestroy(cuda_stream_)); - OF_CUDA_CHECK(hipFree(workspace_)); -} - -Maybe CudaStream::OnExecutionContextSetup() { - OF_CUDA_CHECK(hipSetDevice(device_index_)); - SetAffinityByDevice(device_index_); - return Maybe::Ok(); -} - -Maybe CudaStream::OnExecutionContextTeardown() { return Maybe::Ok(); } - -DeviceType CudaStream::device_type() const { return DeviceType::kCUDA; } - -CudaDevice* CudaStream::device() const { return device_; } - -Maybe CudaStream::Sync() { - hipError_t err = hipStreamSynchronize(cuda_stream_); - if (err == hipSuccess) { - return 
Maybe::Ok(); - } else { - return Error::RuntimeError() << hipGetErrorString(err) << " (" << err << ") "; - } -} - -void CudaStream::RecordEvent(Event* event) { - auto* cuda_event = static_cast(event); // NOLINT - OF_CUDA_CHECK(hipEventRecord(cuda_event->cuda_event(), cuda_stream_)); -} - -hipStream_t CudaStream::cuda_stream() const { return cuda_stream_; } - -hipblasHandle_t CudaStream::cublas_handle() const { return cublas_handle_; } - -void* CudaStream::cublas_workspace() const { return workspace_; } - -size_t CudaStream::cublas_workspace_size() const { return workspace_size_; } - -hipdnnHandle_t CudaStream::cudnn_handle() const { return cudnn_handle_; } - -const hipDeviceProp_t& CudaStream::device_properties() const { return device_->properties(); } - -int CudaStream::cuda_arch() const { - return device_->properties().major * 100 + device_->properties().minor * 10; -} - -#ifdef WITH_ROCM_GRAPHS - -void CudaStream::BeginGraphCapture() { - CHECK(!is_graph_capturing_); - is_graph_capturing_ = true; - OF_CUDA_CHECK(hipStreamBeginCapture(cuda_stream_, hipStreamCaptureModeThreadLocal)); -} - -void CudaStream::EndGraphCapture(CudaGraphExecutable* executable) { - hipGraph_t graph = nullptr; - OF_CUDA_CHECK(hipStreamEndCapture(cuda_stream_, &graph)); - executable->Update(graph); - OF_CUDA_CHECK(hipGraphDestroy(graph)); - is_graph_capturing_ = false; -} - -bool CudaStream::IsGraphCapturing() const { return is_graph_capturing_; } - -void CudaStream::LaunchGraph(const CudaGraphExecutable* executable) { - executable->Launch(cuda_stream_); -} - -#endif // WITH_ROCM_GRAPHS - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/job/global_for.h" +#include "oneflow/core/job/resource_desc.h" +#include "oneflow/core/hardware/node_device_descriptor_manager.h" +#include "oneflow/core/hardware/cuda_device_descriptor.h" +#include "oneflow/core/ep/rocm/cuda_event.h" +#include "oneflow/core/ep/rocm/cuda_device.h" + +#ifdef WITH_ROCM + +namespace oneflow { + +namespace ep { + +namespace { + +constexpr size_t kDefaultWorkspaceSize = 4 * 1024 * 1024; // 4M + +void SetAffinityByDevice(int dev_id) { + auto node_device_desc_mgr = Singleton::Get(); + if (node_device_desc_mgr == nullptr) { return; } + auto node_device_desc = node_device_desc_mgr->GetLocalNodeDeviceDescriptor(); + auto cuda_device = std::dynamic_pointer_cast( + node_device_desc->GetDevice(hardware::kCudaDeviceDescriptorClassName, dev_id)); + if (!cuda_device) { return; } + node_device_desc->Topology()->SetCPUAffinityByPCIBusID(cuda_device->PCIBusID()); + node_device_desc->Topology()->SetMemoryAffinityByPCIBusID(cuda_device->PCIBusID()); +} + +} // namespace + +#ifdef WITH_ROCM_GRAPHS + +CudaGraphExecutable::CudaGraphExecutable() : graph_exec_(nullptr), dev_(-1) {} + +CudaGraphExecutable::~CudaGraphExecutable() { Reset(); } + +void CudaGraphExecutable::Update(hipGraph_t graph) { + int dev = -1; + OF_CUDA_CHECK(hipGetDevice(&dev)); + if (dev != dev_) { Reset(); } + dev_ = dev; + if (graph_exec_ != nullptr) { + hipGraphExecUpdateResult update_result{}; + hipGraphNode_t error_node = nullptr; + OF_CUDA_CHECK(hipGraphExecUpdate(graph_exec_, graph, &error_node, &update_result)); + if (update_result == hipGraphExecUpdateSuccess) { return; } + } + Reset(); + OF_CUDA_CHECK(hipGraphInstantiate(&graph_exec_, graph, NULL, NULL, 0)); +} + +void CudaGraphExecutable::Launch(hipStream_t stream) const { + OF_CUDA_CHECK(hipGraphLaunch(graph_exec_, stream)); +} + +bool CudaGraphExecutable::IsInstantiated() const { return graph_exec_ != nullptr; } + +void CudaGraphExecutable::Reset() { + if (graph_exec_ != nullptr) { + CudaCurrentDeviceGuard guard(dev_); + OF_CUDA_CHECK(hipGraphExecDestroy(graph_exec_)); + } +} + +#endif // WITH_ROCM_GRAPHS + +CudaStream::CudaStream(CudaDevice* device) + : device_index_(device->device_index()), device_(device) { + CudaCurrentDeviceGuard guard(device_index_); + // cuda_stream + OF_CUDA_CHECK(hipStreamCreate(&cuda_stream_)); + // cublas_handle + OF_CUBLAS_CHECK(hipblasCreate(&cublas_handle_)); + OF_CUBLAS_CHECK(hipblasSetStream(cublas_handle_, cuda_stream_)); + + workspace_size_ = kDefaultWorkspaceSize; + OF_CUDA_CHECK(hipMalloc(&workspace_, workspace_size_)); + + OF_CUDNN_CHECK(hipdnnCreate(&cudnn_handle_)); + + OF_CUDNN_CHECK(hipdnnSetStream(cudnn_handle_, cuda_stream_)); +} + +CudaStream::~CudaStream() { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipStreamSynchronize(cuda_stream_)); + OF_CUDNN_CHECK(hipdnnDestroy(cudnn_handle_)); + OF_CUBLAS_CHECK(hipblasDestroy(cublas_handle_)); + + OF_CUDA_CHECK(hipStreamDestroy(cuda_stream_)); + OF_CUDA_CHECK(hipFree(workspace_)); +} + +Maybe CudaStream::OnExecutionContextSetup() { + OF_CUDA_CHECK(hipSetDevice(device_index_)); + SetAffinityByDevice(device_index_); + return Maybe::Ok(); +} + +Maybe CudaStream::OnExecutionContextTeardown() { return Maybe::Ok(); } + +DeviceType CudaStream::device_type() const { return DeviceType::kCUDA; } + +CudaDevice* CudaStream::device() const { return device_; } + +Maybe CudaStream::Sync() { + hipError_t err = hipStreamSynchronize(cuda_stream_); + if (err == hipSuccess) { + return 
Maybe::Ok(); + } else { + return Error::RuntimeError() << hipGetErrorString(err) << " (" << err << ") "; + } +} + +void CudaStream::RecordEvent(Event* event) { + auto* cuda_event = static_cast(event); // NOLINT + OF_CUDA_CHECK(hipEventRecord(cuda_event->cuda_event(), cuda_stream_)); +} + +hipStream_t CudaStream::cuda_stream() const { return cuda_stream_; } + +hipblasHandle_t CudaStream::cublas_handle() const { return cublas_handle_; } + +void* CudaStream::cublas_workspace() const { return workspace_; } + +size_t CudaStream::cublas_workspace_size() const { return workspace_size_; } + +hipdnnHandle_t CudaStream::cudnn_handle() const { return cudnn_handle_; } + +const hipDeviceProp_t& CudaStream::device_properties() const { return device_->properties(); } + +int CudaStream::cuda_arch() const { + return device_->properties().major * 100 + device_->properties().minor * 10; +} + +#ifdef WITH_ROCM_GRAPHS + +void CudaStream::BeginGraphCapture() { + CHECK(!is_graph_capturing_); + is_graph_capturing_ = true; + OF_CUDA_CHECK(hipStreamBeginCapture(cuda_stream_, hipStreamCaptureModeThreadLocal)); +} + +void CudaStream::EndGraphCapture(CudaGraphExecutable* executable) { + hipGraph_t graph = nullptr; + OF_CUDA_CHECK(hipStreamEndCapture(cuda_stream_, &graph)); + executable->Update(graph); + OF_CUDA_CHECK(hipGraphDestroy(graph)); + is_graph_capturing_ = false; +} + +bool CudaStream::IsGraphCapturing() const { return is_graph_capturing_; } + +void CudaStream::LaunchGraph(const CudaGraphExecutable* executable) { + executable->Launch(cuda_stream_); +} + +#endif // WITH_ROCM_GRAPHS + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_stream.h b/oneflow/core/ep/rocm/cuda_stream.h index b3149a7..b63af20 100644 --- a/oneflow/core/ep/rocm/cuda_stream.h +++ b/oneflow/core/ep/rocm/cuda_stream.h @@ -1,168 +1,168 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
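The WITH_ROCM_GRAPHS blocks above mirror the CUDA graph path: the stream is put into thread-local capture mode, the captured hipGraph_t is applied to an existing executable with hipGraphExecUpdate when the topology is unchanged and re-instantiated otherwise, and the executable is then launched. Note that the patch keeps the #define WITH_ROCM_GRAPHS commented out in cuda_stream.h, so this path appears disabled by default. A condensed sketch of the capture / update-or-instantiate flow, assuming graph support in the HIP runtime; CaptureAndLaunch is a hypothetical wrapper.

#include <hip/hip_runtime.h>

void CaptureAndLaunch(hipStream_t stream, hipGraphExec_t* exec,
                      void (*enqueue_work)(hipStream_t)) {
  (void)hipStreamBeginCapture(stream, hipStreamCaptureModeThreadLocal);
  enqueue_work(stream);  // the work recorded between Begin/EndCapture becomes the graph
  hipGraph_t graph = nullptr;
  (void)hipStreamEndCapture(stream, &graph);

  bool updated = false;
  if (*exec != nullptr) {
    hipGraphExecUpdateResult result{};
    hipGraphNode_t error_node = nullptr;
    // Cheap path: patch the existing executable if the captured topology still matches.
    if (hipGraphExecUpdate(*exec, graph, &error_node, &result) == hipSuccess
        && result == hipGraphExecUpdateSuccess) {
      updated = true;
    }
  }
  if (!updated) {
    if (*exec != nullptr) { (void)hipGraphExecDestroy(*exec); }
    (void)hipGraphInstantiate(exec, graph, nullptr, nullptr, 0);
  }
  (void)hipGraphDestroy(graph);
  (void)hipGraphLaunch(*exec, stream);
}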
-*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ - -#include "oneflow/core/ep/include/stream.h" -#include "oneflow/core/ep/rocm/cuda_device.h" - -#ifdef WITH_ROCM - -#include -#include "oneflow/core/hipdnn/hipdnn.h" - -// #if CUDA_VERSION >= 11000 -// #define WITH_ROCM_GRAPHS -// #endif // CUDA_VERSION >= 11000 - -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace ep { - -class CudaDevice; - -#ifdef WITH_ROCM_GRAPHS - -class CudaGraphExecutable { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaGraphExecutable); - CudaGraphExecutable(); - ~CudaGraphExecutable(); - - void Update(hipGraph_t graph); - void Launch(hipStream_t stream) const; - bool IsInstantiated() const; - - private: - void Reset(); - - hipGraphExec_t graph_exec_; - int dev_; -}; - -#endif // WITH_ROCM_GRAPHS - -struct CudaLaunchConfig { - dim3 grid_dim; - dim3 block_dim; - size_t shared_mem_size; - CudaLaunchConfig() : grid_dim{}, block_dim{}, shared_mem_size(0) {} - - CudaLaunchConfig(unsigned int grid_size, unsigned int block_size, size_t shared_mem_size) - : grid_dim(grid_size), block_dim(block_size), shared_mem_size(shared_mem_size) {} -}; - -class CudaStream : public Stream { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaStream); - explicit CudaStream(CudaDevice* device); - ~CudaStream() override; - - static constexpr uint32_t kDefaultBlockSize = 256; - - DeviceType device_type() const override; - CudaDevice* device() const override; - Maybe Sync() override; - void RecordEvent(Event* event) override; - - Maybe OnExecutionContextSetup() override; - Maybe OnExecutionContextTeardown() override; - - hipStream_t cuda_stream() const; - hipblasHandle_t cublas_handle() const; - -// #if CUDA_VERSION >= 10010 - -// cublasLtHandle_t cublas_lt_handle() const; - -// #endif - - hipdnnHandle_t cudnn_handle() const; - void* cublas_workspace() const; - size_t cublas_workspace_size() const; - const hipDeviceProp_t& device_properties() const; - int cuda_arch() const; - - void InitLaunchConfigWithWaves(CudaLaunchConfig* config, size_t elem_cnt, size_t block_size, - size_t max_waves) const { - const uint32_t max_grid_size = max_waves * device_properties().multiProcessorCount - * (device_properties().maxThreadsPerMultiProcessor / block_size); - const uint32_t grid_size = - std::min(max_grid_size, (elem_cnt + block_size - 1) / block_size); - config->grid_dim = dim3(grid_size); - config->block_dim = dim3(block_size); - config->shared_mem_size = 0; - } - -#ifdef __HIPCC__ - template - void LaunchKernel(void (*kernel)(Params...), const CudaLaunchConfig& launch_config, - Args... args) { - kernel<<>>(args...); - } - - template - void LaunchKernel(void (*kernel)(Params...), size_t elem_cnt, size_t max_waves, Args... args) { - constexpr uint32_t block_size = kDefaultBlockSize; - CudaLaunchConfig config{}; - InitLaunchConfigWithWaves(&config, elem_cnt, block_size, max_waves); - LaunchKernel(kernel, config, args...); - } - - template - void LaunchKernelDefaultWaves(void (*kernel)(Params...), size_t elem_cnt, Args... 
args) { - const size_t default_waves = 32; - LaunchKernel(kernel, elem_cnt, default_waves, args...); - } -#endif // __HIPCC__ - -#ifdef WITH_ROCM_GRAPHS - void BeginGraphCapture(); - void EndGraphCapture(CudaGraphExecutable* executable); - bool IsGraphCapturing() const; - void LaunchGraph(const CudaGraphExecutable* executable); -#endif // WITH_ROCM_GRAPHS - - private: - hipStream_t cuda_stream_{}; - hipblasHandle_t cublas_handle_{}; - -// #if CUDA_VERSION >= 10010 - -// cublasLtHandle_t cublas_lt_handle_{}; - -// #endif - - hipdnnHandle_t cudnn_handle_{}; - int device_index_; - void* workspace_{}; - size_t workspace_size_{}; -#ifdef WITH_ROCM_GRAPHS - bool is_graph_capturing_{}; -#endif // WITH_ROCM_GRAPHS - CudaDevice* device_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ + +#include "oneflow/core/ep/include/stream.h" +#include "oneflow/core/ep/rocm/cuda_device.h" + +#ifdef WITH_ROCM + +#include +#include "oneflow/core/hipdnn/hipdnn.h" + +// #if CUDA_VERSION >= 11000 +// #define WITH_ROCM_GRAPHS +// #endif // CUDA_VERSION >= 11000 + +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace ep { + +class CudaDevice; + +#ifdef WITH_ROCM_GRAPHS + +class CudaGraphExecutable { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaGraphExecutable); + CudaGraphExecutable(); + ~CudaGraphExecutable(); + + void Update(hipGraph_t graph); + void Launch(hipStream_t stream) const; + bool IsInstantiated() const; + + private: + void Reset(); + + hipGraphExec_t graph_exec_; + int dev_; +}; + +#endif // WITH_ROCM_GRAPHS + +struct CudaLaunchConfig { + dim3 grid_dim; + dim3 block_dim; + size_t shared_mem_size; + CudaLaunchConfig() : grid_dim{}, block_dim{}, shared_mem_size(0) {} + + CudaLaunchConfig(unsigned int grid_size, unsigned int block_size, size_t shared_mem_size) + : grid_dim(grid_size), block_dim(block_size), shared_mem_size(shared_mem_size) {} +}; + +class CudaStream : public Stream { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaStream); + explicit CudaStream(CudaDevice* device); + ~CudaStream() override; + + static constexpr uint32_t kDefaultBlockSize = 256; + + DeviceType device_type() const override; + CudaDevice* device() const override; + Maybe Sync() override; + void RecordEvent(Event* event) override; + + Maybe OnExecutionContextSetup() override; + Maybe OnExecutionContextTeardown() override; + + hipStream_t cuda_stream() const; + hipblasHandle_t cublas_handle() const; + +// #if CUDA_VERSION >= 10010 + +// cublasLtHandle_t cublas_lt_handle() const; + +// #endif + + hipdnnHandle_t cudnn_handle() const; + void* cublas_workspace() const; + size_t cublas_workspace_size() const; + const hipDeviceProp_t& device_properties() const; + int cuda_arch() const; + + void InitLaunchConfigWithWaves(CudaLaunchConfig* config, size_t elem_cnt, 
size_t block_size, + size_t max_waves) const { + const uint32_t max_grid_size = max_waves * device_properties().multiProcessorCount + * (device_properties().maxThreadsPerMultiProcessor / block_size); + const uint32_t grid_size = + std::min(max_grid_size, (elem_cnt + block_size - 1) / block_size); + config->grid_dim = dim3(grid_size); + config->block_dim = dim3(block_size); + config->shared_mem_size = 0; + } + +#ifdef __HIPCC__ + template + void LaunchKernel(void (*kernel)(Params...), const CudaLaunchConfig& launch_config, + Args... args) { + kernel<<>>(args...); + } + + template + void LaunchKernel(void (*kernel)(Params...), size_t elem_cnt, size_t max_waves, Args... args) { + constexpr uint32_t block_size = kDefaultBlockSize; + CudaLaunchConfig config{}; + InitLaunchConfigWithWaves(&config, elem_cnt, block_size, max_waves); + LaunchKernel(kernel, config, args...); + } + + template + void LaunchKernelDefaultWaves(void (*kernel)(Params...), size_t elem_cnt, Args... args) { + const size_t default_waves = 32; + LaunchKernel(kernel, elem_cnt, default_waves, args...); + } +#endif // __HIPCC__ + +#ifdef WITH_ROCM_GRAPHS + void BeginGraphCapture(); + void EndGraphCapture(CudaGraphExecutable* executable); + bool IsGraphCapturing() const; + void LaunchGraph(const CudaGraphExecutable* executable); +#endif // WITH_ROCM_GRAPHS + + private: + hipStream_t cuda_stream_{}; + hipblasHandle_t cublas_handle_{}; + +// #if CUDA_VERSION >= 10010 + +// cublasLtHandle_t cublas_lt_handle_{}; + +// #endif + + hipdnnHandle_t cudnn_handle_{}; + int device_index_; + void* workspace_{}; + size_t workspace_size_{}; +#ifdef WITH_ROCM_GRAPHS + bool is_graph_capturing_{}; +#endif // WITH_ROCM_GRAPHS + CudaDevice* device_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ diff --git a/oneflow/core/ep/rocm/primitive/add.hip.cpp b/oneflow/core/ep/rocm/primitive/add.hip.cpp index 174cdbb..20cdd17 100644 --- a/oneflow/core/ep/rocm/primitive/add.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/add.hip.cpp @@ -1,139 +1,139 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/add.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/device/cuda_pseudo_bfloat16.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -struct AddFunctor; - -template -struct AddFunctor { - __device__ T operator()(T x) const { return x; } -}; - -template -struct AddFunctor { - __device__ T operator()(T x0, U x1, Args... xs) const { - return x0 + AddFunctor()(x1, xs...); - } -}; - -template -__global__ void AddGpu(const Args*... 
srcs, T* dst, size_t count) { - CUDA_1D_KERNEL_LOOP_T(size_t, i, count) { dst[i] = AddFunctor()(srcs[i]...); } -} - -template -void LaunchAddGpu(hipStream_t stream, const Args*... srcs, T* dst, size_t count) { - AddGpu - <<>>(srcs..., dst, count); -} - -template -void DispatchLaunch(hipStream_t stream, const T* const* srcs, size_t arity, T* dst, size_t count) { - if (arity == 0) { - OF_CUDA_CHECK(hipMemsetAsync(dst, 0, count * sizeof(T), stream)); - } else if (arity == 1) { - OF_CUDA_CHECK(hipMemcpyAsync(dst, srcs[0], count * sizeof(T), hipMemcpyDefault, stream)); - } else if (arity == 2) { - OF_CUDA_CHECK((cuda::elementwise::Binary, T, T, T>( - AddFunctor(), count, dst, srcs[0], srcs[1], stream))); - } else if (arity == 3) { - OF_CUDA_CHECK((cuda::elementwise::Ternary, T, T, T, T>( - AddFunctor(), count, dst, srcs[0], srcs[1], srcs[2], stream))); - } else if (arity == 4) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], dst, count); - } else if (arity == 5) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], dst, count); - } else if (arity == 6) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], srcs[5], - dst, count); - } else if (arity == 7) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], - srcs[5], srcs[6], dst, count); - } else if (arity == 8) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], - srcs[5], srcs[6], srcs[7], dst, count); - } else { - DispatchLaunch(stream, srcs + 7, arity - 7, dst, count); - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], - srcs[5], srcs[6], dst, dst, count); - } -} - -template -class AddImpl : public Add { - public: - OF_DISALLOW_COPY_AND_MOVE(AddImpl); - AddImpl() = default; - ~AddImpl() override = default; - - using Add::Launch; - void Launch(Stream* stream, const void* const* srcs, size_t arity, void* dst, - size_t count) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - DispatchLaunch(cuda_stream, reinterpret_cast(srcs), arity, - reinterpret_cast(dst), count); - } -}; - -template -std::unique_ptr NewAdd() { - return std::unique_ptr(new AddImpl()); -} - -class AddFactoryImpl : public AddFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(AddFactoryImpl); - AddFactoryImpl() = default; - ~AddFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_ADD_ENTRY(type_cpp, type_proto) {type_proto, NewAdd}, - - static const std::map()>> new_add_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_ADD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; - -#undef MAKE_NEW_ADD_ENTRY - - const auto it = new_add_handle.find(data_type); - if (it != new_add_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, AddFactory, AddFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/add.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/device/cuda_pseudo_bfloat16.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +struct AddFunctor; + +template +struct AddFunctor { + __device__ T operator()(T x) const { return x; } +}; + +template +struct AddFunctor { + __device__ T operator()(T x0, U x1, Args... xs) const { + return x0 + AddFunctor()(x1, xs...); + } +}; + +template +__global__ void AddGpu(const Args*... srcs, T* dst, size_t count) { + CUDA_1D_KERNEL_LOOP_T(size_t, i, count) { dst[i] = AddFunctor()(srcs[i]...); } +} + +template +void LaunchAddGpu(hipStream_t stream, const Args*... srcs, T* dst, size_t count) { + AddGpu + <<>>(srcs..., dst, count); +} + +template +void DispatchLaunch(hipStream_t stream, const T* const* srcs, size_t arity, T* dst, size_t count) { + if (arity == 0) { + OF_CUDA_CHECK(hipMemsetAsync(dst, 0, count * sizeof(T), stream)); + } else if (arity == 1) { + OF_CUDA_CHECK(hipMemcpyAsync(dst, srcs[0], count * sizeof(T), hipMemcpyDefault, stream)); + } else if (arity == 2) { + OF_CUDA_CHECK((cuda::elementwise::Binary, T, T, T>( + AddFunctor(), count, dst, srcs[0], srcs[1], stream))); + } else if (arity == 3) { + OF_CUDA_CHECK((cuda::elementwise::Ternary, T, T, T, T>( + AddFunctor(), count, dst, srcs[0], srcs[1], srcs[2], stream))); + } else if (arity == 4) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], dst, count); + } else if (arity == 5) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], dst, count); + } else if (arity == 6) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], srcs[5], + dst, count); + } else if (arity == 7) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], + srcs[5], srcs[6], dst, count); + } else if (arity == 8) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], + srcs[5], srcs[6], srcs[7], dst, count); + } else { + DispatchLaunch(stream, srcs + 7, arity - 7, dst, count); + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], + srcs[5], srcs[6], dst, dst, count); + } +} + +template +class AddImpl : public Add { + public: + OF_DISALLOW_COPY_AND_MOVE(AddImpl); + AddImpl() = default; + ~AddImpl() override = default; + + using Add::Launch; + void Launch(Stream* stream, const void* const* srcs, size_t arity, void* dst, + size_t count) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + DispatchLaunch(cuda_stream, reinterpret_cast(srcs), arity, + reinterpret_cast(dst), count); + } +}; + +template +std::unique_ptr NewAdd() { + return std::unique_ptr(new AddImpl()); +} + +class AddFactoryImpl : public AddFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(AddFactoryImpl); + AddFactoryImpl() = default; + ~AddFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_ADD_ENTRY(type_cpp, type_proto) {type_proto, NewAdd}, + + static const std::map()>> new_add_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_ADD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; + +#undef MAKE_NEW_ADD_ENTRY + + const auto it = new_add_handle.find(data_type); + if (it != new_add_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, AddFactory, AddFactoryImpl); + +} // namespace + +} // 
namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/binary_functor.hip.h b/oneflow/core/ep/rocm/primitive/binary_functor.hip.h index 3dd42dc..b04935e 100644 --- a/oneflow/core/ep/rocm/primitive/binary_functor.hip.h +++ b/oneflow/core/ep/rocm/primitive/binary_functor.hip.h @@ -1,151 +1,151 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/ep/common/primitive/binary_functor.h" - -namespace oneflow { -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src src0, Src src1) const { return pow(src0, src1); } -}; - -template<> -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(bool src0, bool src1) const { - return static_cast(pow(static_cast(src0), static_cast(src1))); - } -}; - -template<> -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC half operator()(half src0, half src1) const { - return static_cast(pow(static_cast(src0), static_cast(src1))); - } -}; - -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) { -#if defined(__CUDA_ARCH__) - coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); -#elif defined(__HIP_DEVICE_COMPILE__) - coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); -#else - coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); -#endif - } - - OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { - return static_cast(0.5) - * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) - + x * coef * exp(static_cast(-0.5) * x * x)) - * dy; - } - Src coef; -}; - -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { - Src tanh_val = tanh(x); - return static_cast(dy * (static_cast(1.0) - tanh_val * tanh_val)); - } -}; - -// /*********nv_bfloat16_kernel*******/ - -// #if CUDA_VERSION >= 11000 - -// template<> -// struct BinaryFunctor { -// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - -// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { -// return static_cast(pow(static_cast(src0), static_cast(src1))); -// } -// }; - -// #define SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(op) \ -// template<> \ -// struct BinaryFunctor { \ -// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ -// \ -// BinaryFunctor float_functor; \ -// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { \ -// return __float2bfloat16(float_functor(__bfloat162float(src0), __bfloat162float(src1))); \ -// } \ -// }; - -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); -// 
SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardsigmoidBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardtanhBackwardWithDyY); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kLeakyReluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); - -// #endif // CUDA_VERSION >= 11000 - -#define SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(op) \ - template<> \ - struct BinaryFunctor { \ - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ - \ - BinaryFunctor float_functor; \ - OF_DEVICE_FUNC half operator()(half src0, half src1) const { \ - return __float2half(float_functor(__half2float(src0), __half2float(src1))); \ - } \ - }; - -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/core/ep/common/primitive/binary_functor.h" + +namespace oneflow { +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +template +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC Dst operator()(Src src0, Src src1) const { return pow(src0, src1); } +}; + +template<> +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(bool src0, bool src1) const { + return static_cast(pow(static_cast(src0), static_cast(src1))); + } +}; + +template<> +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC half operator()(half src0, half src1) const { + return static_cast(pow(static_cast(src0), static_cast(src1))); + } +}; + +template +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) { +#if defined(__CUDA_ARCH__) + coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); +#elif defined(__HIP_DEVICE_COMPILE__) + coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); +#else + coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); +#endif + } + + OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { + return static_cast(0.5) + * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) + + x * coef * exp(static_cast(-0.5) * x * x)) + * dy; + } + Src coef; +}; + +template +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { + Src tanh_val = tanh(x); + return static_cast(dy * (static_cast(1.0) - tanh_val * tanh_val)); + } +}; + +// /*********nv_bfloat16_kernel*******/ + +// #if CUDA_VERSION >= 11000 + +// template<> +// struct BinaryFunctor { +// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + +// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { +// return static_cast(pow(static_cast(src0), static_cast(src1))); +// } +// }; + +// #define SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(op) \ +// template<> \ +// struct BinaryFunctor { \ +// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ +// \ +// BinaryFunctor float_functor; \ +// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { \ +// return __float2bfloat16(float_functor(__bfloat162float(src0), __bfloat162float(src1))); \ +// } \ +// }; + +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardsigmoidBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardtanhBackwardWithDyY); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kLeakyReluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); +// 
SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); + +// #endif // CUDA_VERSION >= 11000 + +#define SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(op) \ + template<> \ + struct BinaryFunctor { \ + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ + \ + BinaryFunctor float_functor; \ + OF_DEVICE_FUNC half operator()(half src0, half src1) const { \ + return __float2half(float_functor(__half2float(src0), __half2float(src1))); \ + } \ + }; + +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp index 25759ec..38909e5 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp @@ -1,110 +1,110 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -template -std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, - Scalar attr1); - -namespace { - -class BroadcastElementwiseBinaryFactoryImpl : public BroadcastElementwiseBinaryFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryFactoryImpl); - BroadcastElementwiseBinaryFactoryImpl() = default; - ~BroadcastElementwiseBinaryFactoryImpl() override = default; - - std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, - size_t max_num_dims) override { - return New(op, src_type, dst_type, max_num_dims, Scalar(), Scalar()); - } - - std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, - size_t max_num_dims, Scalar attr0) override { - return New(op, src_type, dst_type, max_num_dims, attr0, Scalar()); - } - - std::unique_ptr New(BinaryOp binary_op, DataType src_type, - DataType dst_type, size_t max_num_dims, - Scalar attr0, Scalar attr1) override { - if (max_num_dims > kMaxNumDims) { return nullptr; } -#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ - {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ - OF_PP_PAIR_SECOND(data_type_pair)), \ - NewBroadcastElementwiseBinary}, - -#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY( \ - binary_op, src_data_type_pair, dst_data_type_pair) \ - {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(src_data_type_pair), \ - OF_PP_PAIR_SECOND(dst_data_type_pair)), \ - NewBroadcastElementwiseBinary}, - -#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, data_type_pair) \ - {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ - OF_PP_PAIR_SECOND(data_type_pair)), \ - NewBroadcastElementwiseBinary}, - - static const std::map< - std::tuple, - std::function(Scalar, Scalar)>> - new_broadcast_elementwise_binary_handle{ - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, - BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) - - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY, - BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ) - - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY, - BINARY_ACTIVATION_BACKWARD_OP_SEQ, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; - -#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY -#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY - - const auto it = new_broadcast_elementwise_binary_handle.find( - std::make_tuple(binary_op, src_type, dst_type)); - if (it != new_broadcast_elementwise_binary_handle.end()) { - return it->second(attr0, attr1); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastElementwiseBinaryFactory, - BroadcastElementwiseBinaryFactoryImpl); -} // namespace -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +template +std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, + Scalar attr1); + +namespace { + +class BroadcastElementwiseBinaryFactoryImpl : public BroadcastElementwiseBinaryFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryFactoryImpl); + BroadcastElementwiseBinaryFactoryImpl() = default; + ~BroadcastElementwiseBinaryFactoryImpl() override = default; + + std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, + size_t max_num_dims) override { + return New(op, src_type, dst_type, max_num_dims, Scalar(), Scalar()); + } + + std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, + size_t max_num_dims, Scalar attr0) override { + return New(op, src_type, dst_type, max_num_dims, attr0, Scalar()); + } + + std::unique_ptr New(BinaryOp binary_op, DataType src_type, + DataType dst_type, size_t max_num_dims, + Scalar attr0, Scalar attr1) override { + if (max_num_dims > kMaxNumDims) { return nullptr; } +#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ + {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ + OF_PP_PAIR_SECOND(data_type_pair)), \ + NewBroadcastElementwiseBinary}, + +#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY( \ + binary_op, src_data_type_pair, dst_data_type_pair) \ + {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(src_data_type_pair), \ + OF_PP_PAIR_SECOND(dst_data_type_pair)), \ + NewBroadcastElementwiseBinary}, + +#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, data_type_pair) \ + {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ + OF_PP_PAIR_SECOND(data_type_pair)), \ + NewBroadcastElementwiseBinary}, + + static const std::map< + std::tuple, + std::function(Scalar, Scalar)>> + new_broadcast_elementwise_binary_handle{ + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, + BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) + + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY, + BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ) + + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY, + BINARY_ACTIVATION_BACKWARD_OP_SEQ, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; + +#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY +#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY + + const auto it = 
new_broadcast_elementwise_binary_handle.find( + std::make_tuple(binary_op, src_type, dst_type)); + if (it != new_broadcast_elementwise_binary_handle.end()) { + return it->second(attr0, attr1); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastElementwiseBinaryFactory, + BroadcastElementwiseBinaryFactoryImpl); +} // namespace +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h index 20b2717..9a48365 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h @@ -1,397 +1,397 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -namespace { - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == sizeof(T) * N, ""); - OF_DEVICE_FUNC Pack() { - // do nothing - } - PackType storage; - T elem[N]; -}; - -template -struct BroadcastElementwiseBinaryParams { - NdIndexOffsetHelper src0_index_helper; - NdIndexOffsetHelper src1_index_helper; - NdIndexOffsetHelper dst_index_helper; - size_t num_dims; - IndexType src0_index_mask[max_dims]; - IndexType src1_index_mask[max_dims]; - IndexType count{}; - const void* src0{}; - const void* src1{}; - void* dst{}; - Scalar attr0; - Scalar attr1; -}; - -template -__global__ void BroadcastElementwiseBinaryGpu( - BroadcastElementwiseBinaryParams params) { - constexpr size_t dst_pack_size = - src0_pack_size > src1_pack_size ? 
src0_pack_size : src1_pack_size; - static_assert(src0_pack_size == dst_pack_size || src0_pack_size == 1, ""); - static_assert(src1_pack_size == dst_pack_size || src1_pack_size == 1, ""); - - const PackType* src0 = - reinterpret_cast*>(params.src0); - const PackType* src1 = - reinterpret_cast*>(params.src1); - PackType* dst = reinterpret_cast*>(params.dst); - - IndexType src0_index[max_dims]; - IndexType src1_index[max_dims]; - IndexType dst_index[max_dims]; - size_t num_dims = params.num_dims; - CUDA_1D_KERNEL_LOOP_T(IndexType, offset, params.count) { - params.dst_index_helper.OffsetToNdIndex(offset, dst_index, num_dims); -#pragma unroll - for (int i = 0; i < max_dims; ++i) { - if (i < num_dims) { - src0_index[i] = params.src0_index_mask[i] * dst_index[i]; - src1_index[i] = params.src1_index_mask[i] * dst_index[i]; - } else { - src0_index[i] = 0; - src1_index[i] = 0; - } - } - const IndexType src0_offset = params.src0_index_helper.NdIndexToOffset(src0_index, num_dims); - const IndexType src1_offset = params.src1_index_helper.NdIndexToOffset(src1_index, num_dims); - Pack src0_pack; - src0_pack.storage = src0[src0_offset]; - Pack src1_pack; - src1_pack.storage = src1[src1_offset]; - Pack dst_pack; - BinaryFunctor functor(params.attr0, params.attr1); -#pragma unroll - for (int j = 0; j < dst_pack_size; ++j) { - const Src src0_val = - (src0_pack_size == dst_pack_size) ? src0_pack.elem[j] : src0_pack.elem[0]; - const Src src1_val = - (src1_pack_size == dst_pack_size) ? src1_pack.elem[j] : src1_pack.elem[0]; - dst_pack.elem[j] = functor(src0_val, src1_val); - } - dst[offset] = dst_pack.storage; - } -} - -template -void LaunchKernel(Stream* stream, int num_dims, const int64_t* src0_dims, const void* src0, - const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, void* dst, - size_t count, Scalar attr0, Scalar attr1) { - BroadcastElementwiseBinaryParams params; - for (size_t i = 0; i < num_dims; ++i) { - params.src0_index_mask[i] = (src0_dims[i] == 1) ? 0 : 1; - params.src1_index_mask[i] = (src1_dims[i] == 1) ? 
0 : 1; - } - params.src0_index_helper = NdIndexOffsetHelper(src0_dims, num_dims); - params.src1_index_helper = NdIndexOffsetHelper(src1_dims, num_dims); - params.dst_index_helper = NdIndexOffsetHelper(dst_dims, num_dims); - params.num_dims = num_dims; - params.src0 = src0; - params.src1 = src1; - params.dst = dst; - params.count = static_cast(count); - params.attr0 = attr0; - params.attr1 = attr1; - auto* cuda_stream = stream->As(); - BroadcastElementwiseBinaryGpu - <<cuda_stream()>>>(params); -} - -template -void DispatchIndexType(Stream* stream, size_t num_dims, const int64_t* src0_dims, const void* src0, - const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, - void* dst, Scalar attr0, Scalar attr1) { - size_t count = GetElementCount(num_dims, dst_dims); - if (count < GetMaxVal()) { - LaunchKernel( - stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); - } else { - LaunchKernel( - stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); - } -} - -template -void DispatchPackSize(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, - const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, - const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, - Scalar attr1) { - void (*func)(Stream* /*stream*/, size_t /*num_dims*/, const int64_t* /*src0_dims*/, - const void* /*src0*/, const int64_t* /*src1_dims*/, const void* /*src1*/, - const int64_t* /*dst_dims*/, void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = - nullptr; - if (src0_pack_size == 1 && src1_pack_size == 1) { - func = DispatchIndexType; - } else if (src0_pack_size == 4 && src1_pack_size == 4) { - func = DispatchIndexType; - } else if (src0_pack_size == 1 && src1_pack_size == 4) { - func = DispatchIndexType; - } else if (src0_pack_size == 4 && src1_pack_size == 1) { - func = DispatchIndexType; - } else { - UNIMPLEMENTED(); - } - func(stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, attr0, attr1); -} - -template -void DispatchNumDims(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, - const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, - const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, - Scalar attr1) { - void (*func)(Stream* /*stream*/, size_t /*src0_pack_size*/, size_t /*src1_pack_size*/, - size_t /*num_dims*/, const int64_t* /*src0_dims*/, const void* /*src0*/, - const int64_t* /*src1_dims*/, const void* /*src1*/, const int64_t* /*dst_dims*/, - void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = nullptr; - CHECK_NE(num_dims, 1); - if (num_dims == 2) { - func = DispatchPackSize; - } else if (num_dims == 3) { - func = DispatchPackSize; - } else if (num_dims == 4) { - func = DispatchPackSize; - } else if (num_dims <= 8) { - func = DispatchPackSize; - } else { - UNIMPLEMENTED(); - } - func(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, - dst, attr0, attr1); -} - -template -size_t GetPackSize(size_t num_src_dims, const int64_t* src0_dims, const void* src0, - const int64_t* src1_dims, const void* src1, void* dst) { - static_assert(max_pack_size > 0 && (max_pack_size & (max_pack_size - 1)) == 0, ""); - CHECK(src0_dims[num_src_dims - 1] != 1 || src1_dims[num_src_dims - 1] != 1); - auto dst_ptr = reinterpret_cast(dst); - for (size_t pack_size = max_pack_size; pack_size > 2; pack_size /= 2) { - bool is_src0_supported = (src0_dims[num_src_dims - 1] == 1) - || 
IsPackSizeSupported(pack_size, num_src_dims, src0_dims, src0); - bool is_src1_supported = (src1_dims[num_src_dims - 1] == 1) - || IsPackSizeSupported(pack_size, num_src_dims, src1_dims, src1); - if (is_src0_supported && is_src1_supported && (dst_ptr % (pack_size * sizeof(R))) == 0) { - return pack_size; - } - } - return 1; -} - -constexpr size_t kMaxPackSize = 4; - -template -void LaunchWithSimplified(Stream* stream, size_t simplified_num_dims, int64_t* simplified_src0_dims, - const void* src0, int64_t* simplified_src1_dims, const void* src1, - int64_t* simplified_dst_dims, void* dst, Scalar attr0, Scalar attr1) { - CHECK_LE(simplified_num_dims, kMaxNumDims); - size_t pack_size = GetPackSize(simplified_num_dims, simplified_src0_dims, - src0, simplified_src1_dims, src1, dst); - size_t src0_pack_size = 1; - size_t src1_pack_size = 1; - if (simplified_src0_dims[simplified_num_dims - 1] != 1) { - simplified_src0_dims[simplified_num_dims - 1] /= pack_size; - src0_pack_size = pack_size; - } - if (simplified_src1_dims[simplified_num_dims - 1] != 1) { - simplified_src1_dims[simplified_num_dims - 1] /= pack_size; - src1_pack_size = pack_size; - } - simplified_dst_dims[simplified_num_dims - 1] /= pack_size; - DispatchNumDims(stream, src0_pack_size, src1_pack_size, simplified_num_dims, - simplified_src0_dims, src0, simplified_src1_dims, src1, - simplified_dst_dims, dst, attr0, attr1); -} - -template -struct BinaryLhsScalarFunctor { - __host__ __device__ BinaryLhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) - : scalar(scalar), functor(attr0, attr1) {} - __device__ Dst operator()(Src src) const { return functor(scalar, src); } - const Src scalar; - BinaryFunctor functor; -}; - -template -struct BinaryRhsScalarFunctor { - __host__ __device__ BinaryRhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) - : scalar(scalar), functor(attr0, attr1) {} - __device__ Dst operator()(Src src) const { return functor(src, scalar); } - const Src scalar; - BinaryFunctor functor; -}; - -template -struct BinaryLhsScalarPtrFunctorFactory { - __host__ __device__ BinaryLhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, - Scalar attr1) - : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} - __device__ BinaryLhsScalarFunctor operator()() const { - return BinaryLhsScalarFunctor(*scalar_ptr, attr0, attr1); - } - const Src* scalar_ptr; - Scalar attr0, attr1; -}; - -template -struct BinaryRhsScalarPtrFunctorFactory { - __host__ __device__ explicit BinaryRhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, - Scalar attr1) - : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} - __device__ BinaryRhsScalarFunctor operator()() const { - return BinaryRhsScalarFunctor(*scalar_ptr, attr0, attr1); - } - const Src* scalar_ptr; - Scalar attr0, attr1; -}; - -template -void DispatchLaunch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const Src* src0, - size_t num_src1_dims, const int64_t* src1_dims, const Src* src1, Dst* dst, - Scalar attr0, Scalar attr1) { - auto* cuda_stream = stream->As(); - size_t simplified_num_dims = 0; - int64_t simplified_src0_dims[kMaxNumDims]; - int64_t simplified_src1_dims[kMaxNumDims]; - int64_t simplified_dst_dims[kMaxNumDims]; - SimplifyBroadcastDims(num_src0_dims, src0_dims, num_src1_dims, src1_dims, - &simplified_num_dims, simplified_src0_dims, - simplified_src1_dims, simplified_dst_dims); - CheckInplace(simplified_num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1, - simplified_dst_dims, dst); - if (IsDimsEquals(simplified_num_dims, 
simplified_src0_dims, simplified_num_dims, - simplified_src1_dims)) { - const int64_t elem_cnt = GetElementCount(simplified_num_dims, simplified_src0_dims); - OF_CUDA_CHECK((cuda::elementwise::Binary( - BinaryFunctor(attr0, attr1), elem_cnt, dst, src0, - src1, cuda_stream->cuda_stream()))); - } else { - if (simplified_num_dims == 1 && simplified_src0_dims[0] == 1) { - OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( - BinaryLhsScalarPtrFunctorFactory(src0, attr0, attr1), - simplified_src1_dims[0], dst, src1, cuda_stream->cuda_stream()))); - } else if (simplified_num_dims == 1 && simplified_src1_dims[0] == 1) { - OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( - BinaryRhsScalarPtrFunctorFactory(src1, attr0, attr1), - simplified_src0_dims[0], dst, src0, cuda_stream->cuda_stream()))); - } else { - LaunchWithSimplified(stream, simplified_num_dims, simplified_src0_dims, - src0, simplified_src1_dims, src1, - simplified_dst_dims, dst, attr0, attr1); - } - } -} - -template -T GetValue(Scalar value) { - return value.Value(); -} - -template<> -half GetValue(Scalar value) { - return static_cast(GetValue(value)); -} - -// #if CUDA_VERSION >= 11000 - -// template<> -// nv_bfloat16 GetValue(Scalar value) { -// return static_cast(GetValue(value)); -// } - -// #endif // CUDA_VERSION >= 11000 - -template -class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { - public: - OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryImpl); - BroadcastElementwiseBinaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} - ~BroadcastElementwiseBinaryImpl() override = default; - - void Launch(Stream* stream, Scalar src0, size_t num_src1_dims, const int64_t* src1_dims, - const void* src1, void* dst) override { - auto* cuda_stream = stream->As(); - const size_t elem_cnt = GetElementCount(num_src1_dims, src1_dims); - OF_CUDA_CHECK((cuda::elementwise::Unary( - BinaryLhsScalarFunctor(GetValue(src0), attr0, attr1), elem_cnt, - reinterpret_cast(dst), reinterpret_cast(src1), - cuda_stream->cuda_stream()))); - } - void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, - Scalar src1, void* dst) override { - auto* cuda_stream = stream->As(); - const size_t elem_cnt = GetElementCount(num_src0_dims, src0_dims); - OF_CUDA_CHECK((cuda::elementwise::Unary( - BinaryRhsScalarFunctor(GetValue(src1), attr0, attr1), elem_cnt, - reinterpret_cast(dst), reinterpret_cast(src0), - cuda_stream->cuda_stream()))); - } - void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, - size_t num_src1_dims, const int64_t* src1_dims, const void* src1, - void* dst) override { - DispatchLaunch( - stream, num_src0_dims, src0_dims, reinterpret_cast(src0), num_src1_dims, - src1_dims, reinterpret_cast(src1), reinterpret_cast(dst), attr0, attr1); - } - - private: - Scalar attr0, attr1; -}; - -} // namespace - -template -std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, - Scalar attr1) { - return std::unique_ptr( - new BroadcastElementwiseBinaryImpl(attr0, attr1)); -} - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +namespace { + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * N, ""); + OF_DEVICE_FUNC Pack() { + // do nothing + } + PackType storage; + T elem[N]; +}; + +template +struct BroadcastElementwiseBinaryParams { + NdIndexOffsetHelper src0_index_helper; + NdIndexOffsetHelper src1_index_helper; + NdIndexOffsetHelper dst_index_helper; + size_t num_dims; + IndexType src0_index_mask[max_dims]; + IndexType src1_index_mask[max_dims]; + IndexType count{}; + const void* src0{}; + const void* src1{}; + void* dst{}; + Scalar attr0; + Scalar attr1; +}; + +template +__global__ void BroadcastElementwiseBinaryGpu( + BroadcastElementwiseBinaryParams params) { + constexpr size_t dst_pack_size = + src0_pack_size > src1_pack_size ? src0_pack_size : src1_pack_size; + static_assert(src0_pack_size == dst_pack_size || src0_pack_size == 1, ""); + static_assert(src1_pack_size == dst_pack_size || src1_pack_size == 1, ""); + + const PackType* src0 = + reinterpret_cast*>(params.src0); + const PackType* src1 = + reinterpret_cast*>(params.src1); + PackType* dst = reinterpret_cast*>(params.dst); + + IndexType src0_index[max_dims]; + IndexType src1_index[max_dims]; + IndexType dst_index[max_dims]; + size_t num_dims = params.num_dims; + CUDA_1D_KERNEL_LOOP_T(IndexType, offset, params.count) { + params.dst_index_helper.OffsetToNdIndex(offset, dst_index, num_dims); +#pragma unroll + for (int i = 0; i < max_dims; ++i) { + if (i < num_dims) { + src0_index[i] = params.src0_index_mask[i] * dst_index[i]; + src1_index[i] = params.src1_index_mask[i] * dst_index[i]; + } else { + src0_index[i] = 0; + src1_index[i] = 0; + } + } + const IndexType src0_offset = params.src0_index_helper.NdIndexToOffset(src0_index, num_dims); + const IndexType src1_offset = params.src1_index_helper.NdIndexToOffset(src1_index, num_dims); + Pack src0_pack; + src0_pack.storage = src0[src0_offset]; + Pack src1_pack; + src1_pack.storage = src1[src1_offset]; + Pack dst_pack; + BinaryFunctor functor(params.attr0, params.attr1); +#pragma unroll + for (int j = 0; j < dst_pack_size; ++j) { + const Src src0_val = + (src0_pack_size == dst_pack_size) ? src0_pack.elem[j] : src0_pack.elem[0]; + const Src src1_val = + (src1_pack_size == dst_pack_size) ? 
src1_pack.elem[j] : src1_pack.elem[0]; + dst_pack.elem[j] = functor(src0_val, src1_val); + } + dst[offset] = dst_pack.storage; + } +} + +template +void LaunchKernel(Stream* stream, int num_dims, const int64_t* src0_dims, const void* src0, + const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, void* dst, + size_t count, Scalar attr0, Scalar attr1) { + BroadcastElementwiseBinaryParams params; + for (size_t i = 0; i < num_dims; ++i) { + params.src0_index_mask[i] = (src0_dims[i] == 1) ? 0 : 1; + params.src1_index_mask[i] = (src1_dims[i] == 1) ? 0 : 1; + } + params.src0_index_helper = NdIndexOffsetHelper(src0_dims, num_dims); + params.src1_index_helper = NdIndexOffsetHelper(src1_dims, num_dims); + params.dst_index_helper = NdIndexOffsetHelper(dst_dims, num_dims); + params.num_dims = num_dims; + params.src0 = src0; + params.src1 = src1; + params.dst = dst; + params.count = static_cast(count); + params.attr0 = attr0; + params.attr1 = attr1; + auto* cuda_stream = stream->As(); + BroadcastElementwiseBinaryGpu + <<cuda_stream()>>>(params); +} + +template +void DispatchIndexType(Stream* stream, size_t num_dims, const int64_t* src0_dims, const void* src0, + const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, + void* dst, Scalar attr0, Scalar attr1) { + size_t count = GetElementCount(num_dims, dst_dims); + if (count < GetMaxVal()) { + LaunchKernel( + stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); + } else { + LaunchKernel( + stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); + } +} + +template +void DispatchPackSize(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, + const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, + const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, + Scalar attr1) { + void (*func)(Stream* /*stream*/, size_t /*num_dims*/, const int64_t* /*src0_dims*/, + const void* /*src0*/, const int64_t* /*src1_dims*/, const void* /*src1*/, + const int64_t* /*dst_dims*/, void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = + nullptr; + if (src0_pack_size == 1 && src1_pack_size == 1) { + func = DispatchIndexType; + } else if (src0_pack_size == 4 && src1_pack_size == 4) { + func = DispatchIndexType; + } else if (src0_pack_size == 1 && src1_pack_size == 4) { + func = DispatchIndexType; + } else if (src0_pack_size == 4 && src1_pack_size == 1) { + func = DispatchIndexType; + } else { + UNIMPLEMENTED(); + } + func(stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, attr0, attr1); +} + +template +void DispatchNumDims(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, + const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, + const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, + Scalar attr1) { + void (*func)(Stream* /*stream*/, size_t /*src0_pack_size*/, size_t /*src1_pack_size*/, + size_t /*num_dims*/, const int64_t* /*src0_dims*/, const void* /*src0*/, + const int64_t* /*src1_dims*/, const void* /*src1*/, const int64_t* /*dst_dims*/, + void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = nullptr; + CHECK_NE(num_dims, 1); + if (num_dims == 2) { + func = DispatchPackSize; + } else if (num_dims == 3) { + func = DispatchPackSize; + } else if (num_dims == 4) { + func = DispatchPackSize; + } else if (num_dims <= 8) { + func = DispatchPackSize; + } else { + UNIMPLEMENTED(); + } + func(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0, src1_dims, 
src1, dst_dims, + dst, attr0, attr1); +} + +template +size_t GetPackSize(size_t num_src_dims, const int64_t* src0_dims, const void* src0, + const int64_t* src1_dims, const void* src1, void* dst) { + static_assert(max_pack_size > 0 && (max_pack_size & (max_pack_size - 1)) == 0, ""); + CHECK(src0_dims[num_src_dims - 1] != 1 || src1_dims[num_src_dims - 1] != 1); + auto dst_ptr = reinterpret_cast(dst); + for (size_t pack_size = max_pack_size; pack_size > 2; pack_size /= 2) { + bool is_src0_supported = (src0_dims[num_src_dims - 1] == 1) + || IsPackSizeSupported(pack_size, num_src_dims, src0_dims, src0); + bool is_src1_supported = (src1_dims[num_src_dims - 1] == 1) + || IsPackSizeSupported(pack_size, num_src_dims, src1_dims, src1); + if (is_src0_supported && is_src1_supported && (dst_ptr % (pack_size * sizeof(R))) == 0) { + return pack_size; + } + } + return 1; +} + +constexpr size_t kMaxPackSize = 4; + +template +void LaunchWithSimplified(Stream* stream, size_t simplified_num_dims, int64_t* simplified_src0_dims, + const void* src0, int64_t* simplified_src1_dims, const void* src1, + int64_t* simplified_dst_dims, void* dst, Scalar attr0, Scalar attr1) { + CHECK_LE(simplified_num_dims, kMaxNumDims); + size_t pack_size = GetPackSize(simplified_num_dims, simplified_src0_dims, + src0, simplified_src1_dims, src1, dst); + size_t src0_pack_size = 1; + size_t src1_pack_size = 1; + if (simplified_src0_dims[simplified_num_dims - 1] != 1) { + simplified_src0_dims[simplified_num_dims - 1] /= pack_size; + src0_pack_size = pack_size; + } + if (simplified_src1_dims[simplified_num_dims - 1] != 1) { + simplified_src1_dims[simplified_num_dims - 1] /= pack_size; + src1_pack_size = pack_size; + } + simplified_dst_dims[simplified_num_dims - 1] /= pack_size; + DispatchNumDims(stream, src0_pack_size, src1_pack_size, simplified_num_dims, + simplified_src0_dims, src0, simplified_src1_dims, src1, + simplified_dst_dims, dst, attr0, attr1); +} + +template +struct BinaryLhsScalarFunctor { + __host__ __device__ BinaryLhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) + : scalar(scalar), functor(attr0, attr1) {} + __device__ Dst operator()(Src src) const { return functor(scalar, src); } + const Src scalar; + BinaryFunctor functor; +}; + +template +struct BinaryRhsScalarFunctor { + __host__ __device__ BinaryRhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) + : scalar(scalar), functor(attr0, attr1) {} + __device__ Dst operator()(Src src) const { return functor(src, scalar); } + const Src scalar; + BinaryFunctor functor; +}; + +template +struct BinaryLhsScalarPtrFunctorFactory { + __host__ __device__ BinaryLhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, + Scalar attr1) + : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} + __device__ BinaryLhsScalarFunctor operator()() const { + return BinaryLhsScalarFunctor(*scalar_ptr, attr0, attr1); + } + const Src* scalar_ptr; + Scalar attr0, attr1; +}; + +template +struct BinaryRhsScalarPtrFunctorFactory { + __host__ __device__ explicit BinaryRhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, + Scalar attr1) + : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} + __device__ BinaryRhsScalarFunctor operator()() const { + return BinaryRhsScalarFunctor(*scalar_ptr, attr0, attr1); + } + const Src* scalar_ptr; + Scalar attr0, attr1; +}; + +template +void DispatchLaunch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const Src* src0, + size_t num_src1_dims, const int64_t* src1_dims, const Src* src1, Dst* dst, + Scalar attr0, Scalar 
attr1) { + auto* cuda_stream = stream->As(); + size_t simplified_num_dims = 0; + int64_t simplified_src0_dims[kMaxNumDims]; + int64_t simplified_src1_dims[kMaxNumDims]; + int64_t simplified_dst_dims[kMaxNumDims]; + SimplifyBroadcastDims(num_src0_dims, src0_dims, num_src1_dims, src1_dims, + &simplified_num_dims, simplified_src0_dims, + simplified_src1_dims, simplified_dst_dims); + CheckInplace(simplified_num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1, + simplified_dst_dims, dst); + if (IsDimsEquals(simplified_num_dims, simplified_src0_dims, simplified_num_dims, + simplified_src1_dims)) { + const int64_t elem_cnt = GetElementCount(simplified_num_dims, simplified_src0_dims); + OF_CUDA_CHECK((cuda::elementwise::Binary( + BinaryFunctor(attr0, attr1), elem_cnt, dst, src0, + src1, cuda_stream->cuda_stream()))); + } else { + if (simplified_num_dims == 1 && simplified_src0_dims[0] == 1) { + OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( + BinaryLhsScalarPtrFunctorFactory(src0, attr0, attr1), + simplified_src1_dims[0], dst, src1, cuda_stream->cuda_stream()))); + } else if (simplified_num_dims == 1 && simplified_src1_dims[0] == 1) { + OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( + BinaryRhsScalarPtrFunctorFactory(src1, attr0, attr1), + simplified_src0_dims[0], dst, src0, cuda_stream->cuda_stream()))); + } else { + LaunchWithSimplified(stream, simplified_num_dims, simplified_src0_dims, + src0, simplified_src1_dims, src1, + simplified_dst_dims, dst, attr0, attr1); + } + } +} + +template +T GetValue(Scalar value) { + return value.Value(); +} + +template<> +half GetValue(Scalar value) { + return static_cast(GetValue(value)); +} + +// #if CUDA_VERSION >= 11000 + +// template<> +// nv_bfloat16 GetValue(Scalar value) { +// return static_cast(GetValue(value)); +// } + +// #endif // CUDA_VERSION >= 11000 + +template +class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { + public: + OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryImpl); + BroadcastElementwiseBinaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} + ~BroadcastElementwiseBinaryImpl() override = default; + + void Launch(Stream* stream, Scalar src0, size_t num_src1_dims, const int64_t* src1_dims, + const void* src1, void* dst) override { + auto* cuda_stream = stream->As(); + const size_t elem_cnt = GetElementCount(num_src1_dims, src1_dims); + OF_CUDA_CHECK((cuda::elementwise::Unary( + BinaryLhsScalarFunctor(GetValue(src0), attr0, attr1), elem_cnt, + reinterpret_cast(dst), reinterpret_cast(src1), + cuda_stream->cuda_stream()))); + } + void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, + Scalar src1, void* dst) override { + auto* cuda_stream = stream->As(); + const size_t elem_cnt = GetElementCount(num_src0_dims, src0_dims); + OF_CUDA_CHECK((cuda::elementwise::Unary( + BinaryRhsScalarFunctor(GetValue(src1), attr0, attr1), elem_cnt, + reinterpret_cast(dst), reinterpret_cast(src0), + cuda_stream->cuda_stream()))); + } + void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, + size_t num_src1_dims, const int64_t* src1_dims, const void* src1, + void* dst) override { + DispatchLaunch( + stream, num_src0_dims, src0_dims, reinterpret_cast(src0), num_src1_dims, + src1_dims, reinterpret_cast(src1), reinterpret_cast(dst), attr0, attr1); + } + + private: + Scalar attr0, attr1; +}; + +} // namespace + +template +std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, + Scalar attr1) { + return 
std::unique_ptr<BroadcastElementwiseBinary>(
+      new BroadcastElementwiseBinaryImpl<binary_op, Src, Dst>(attr0, attr1));
+}
+
+}  // namespace broadcast_elementwise_binary
+}  // namespace primitive
+}  // namespace ep
+
 }  // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp
index c6252be..c991252 100644
--- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp
+++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp
@@ -1,39 +1,39 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
-
-namespace oneflow {
-
-namespace ep {
-namespace primitive {
-namespace broadcast_elementwise_binary {
-
-#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op,      \
-                                                                           data_type_pair) \
-  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<      \
-      binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>(      \
-      Scalar attr0, Scalar attr1);
-
-OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY,
-                                 BINARY_ACTIVATION_BACKWARD_OP_SEQ,
-                                 CUDA_PRIMITIVE_FLOATING_TYPE_SEQ);
-
-}  // namespace broadcast_elementwise_binary
-}  // namespace primitive
-}  // namespace ep
-
-}  // namespace oneflow
-
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, \ + data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY, + BINARY_ACTIVATION_BACKWARD_OP_SEQ, + CUDA_PRIMITIVE_FLOATING_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + +} // namespace oneflow + diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp index a7fc91e..fd9c0d4 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp @@ -1,38 +1,38 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY( \ - binary_op, src_data_type_pair, dst_data_type_pair) \ - template std::unique_ptr NewBroadcastElementwiseBinary< \ - binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ - Scalar attr0, Scalar attr1); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY, - BINARY_COMPARISION_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY( \ + binary_op, src_data_type_pair, dst_data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY, + BINARY_COMPARISION_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp index ffa05eb..4a03ee1 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp @@ -1,38 +1,38 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY(binary_op, src_data_type_pair, \ - dst_data_type_pair) \ - template std::unique_ptr NewBroadcastElementwiseBinary< \ - binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ - Scalar attr0, Scalar attr1); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY, - BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, - CUDA_PRIMITIVE_ALL_TYPE_SEQ, CUDA_PRIMITIVE_BOOL_TYPE_SEQ); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY(binary_op, src_data_type_pair, \ + dst_data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY, + BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, + CUDA_PRIMITIVE_ALL_TYPE_SEQ, CUDA_PRIMITIVE_BOOL_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp index a2ca2bb..144f1a7 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp @@ -1,36 +1,36 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ - template std::unique_ptr NewBroadcastElementwiseBinary< \ - binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>( \ - Scalar attr0, Scalar attr1); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, - BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
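The INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_*_ENTRY macros in these files are X-macro style explicit instantiations: OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE expands one explicit-instantiation line per (op, dtype) combination, so the factory defined in broadcast_elementwise_binary.hip.h gets compiled for every supported pairing. A toy version of the same pattern with hand-written entries (names below are made up for illustration only):

#include <cstdio>

template<int op, typename T>
T Apply(T a, T b) {
  return op == 0 ? a + b : a * b;
}

// One explicit instantiation per (op, type) pair, the role each
// INSTANTIATE_..._ENTRY expansion plays in the files above.
#define INSTANTIATE_APPLY(op, T) template T Apply<op, T>(T, T);
INSTANTIATE_APPLY(0, float)
INSTANTIATE_APPLY(0, double)
INSTANTIATE_APPLY(1, float)
INSTANTIATE_APPLY(1, double)
#undef INSTANTIATE_APPLY

int main() {
  std::printf("%g\n", Apply<1, double>(1.5, 2.0));
  return 0;
}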
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, + BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp b/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp index d42ace2..3a92b64 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp @@ -1,237 +1,237 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM - -#include "oneflow/core/ep/include/primitive/primitive.h" -#include "oneflow/core/ep/include/primitive/broadcast_matmul.h" -#include "oneflow/core/ep/common/primitive/broadcast_matmul.h" -#include "oneflow/core/common/optional.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include -#include -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace broadcast_matmul { - -namespace internal { - -namespace { - -constexpr size_t kMaxNumDims = 8; - -Optional OptCudaDataType(DataType data_type) { - switch (data_type) { - case kFloat: return HIPBLAS_R_32F; - case kDouble: return HIPBLAS_R_64F; - case kFloat16: return HIPBLAS_R_16F; -// #if CUDA_VERSION >= 11000 -// case kBFloat16: return CUDA_R_16BF; -// #endif // CUDA_VERSION >= 11000 - default: return NullOpt; - } -} - -hipblasDatatype_t GetCudaDataType(DataType data_type) { - auto cuda_data_type = OptCudaDataType(data_type); - CHECK(cuda_data_type.has_value()); - return cuda_data_type.value_or(HIPBLAS_R_32F); -} - -union CublasScalarParameter { - double d; - float s; -}; - -CublasScalarParameter GetCublasScalarParameter(Scalar scalar, hipblasDatatype_t compute_type) { - CublasScalarParameter sp{}; - if (compute_type == HIPBLAS_R_64F) { - sp.d = scalar.Value(); - } else if (compute_type == HIPBLAS_R_32F) { - sp.s = scalar.Value(); - } else if (compute_type == HIPBLAS_R_16F) { - sp.s = scalar.Value(); - } else { - UNIMPLEMENTED(); - } - return sp; -} - -hipblasDatatype_t GetComputeType(DataType data_type) { - switch (data_type) { - case kFloat: return HIPBLAS_R_32F; - case kDouble: return HIPBLAS_R_64F; - case kFloat16: return HIPBLAS_R_16F; -// #if CUDA_VERSION >= 11000 -// case kBFloat16: return HIPBLAS_R_32F; -// #endif // CUDA_VERSION >= 11000 - default: UNIMPLEMENTED(); return HIPBLAS_R_32F; - } -} - -void LaunchBroadcastMatmul(Stream* stream, DataType 
data_type, BlasTransposeType transpose_a, - BlasTransposeType transpose_b, int64_t num_batch_dims, - const int64_t* broadcast_batch_dims, const int64_t* a_batch_dims, - const int64_t* b_batch_dims, const int64_t* c_batch_dims, int64_t m, - int64_t n, int64_t k, Scalar alpha, const void* a, const void* b, - Scalar beta, void* c) { - auto* cuda_stream = stream->As(); - const auto cuda_data_type = GetCudaDataType(data_type); - const auto compute_type = GetComputeType(data_type); - const auto sp_alpha = GetCublasScalarParameter(alpha, compute_type); - __half h_alpha = 0; - if (compute_type == HIPBLAS_R_16F) { - h_alpha = __float2half(sp_alpha.s); - } - const auto GetCublasOperation = [](BlasTransposeType transpose_type) { - if (transpose_type == BlasTransposeType::N) { - return HIPBLAS_OP_N; - } else if (transpose_type == BlasTransposeType::T) { - return HIPBLAS_OP_T; - } else { - UNIMPLEMENTED(); - return HIPBLAS_OP_N; - } - }; - const hipblasOperation_t cublas_trans_a = GetCublasOperation(transpose_b); - const hipblasOperation_t cublas_trans_b = GetCublasOperation(transpose_a); - const int cublas_m = n; - const int cublas_n = m; - const int cublas_k = k; - int cublas_lda = 0; - if (transpose_b == BlasTransposeType::N) { - cublas_lda = n; - } else if (transpose_b == BlasTransposeType::T) { - cublas_lda = k; - } else { - UNIMPLEMENTED(); - } - int cublas_ldb = 0; - if (transpose_a == BlasTransposeType::N) { - cublas_ldb = k; - } else if (transpose_a == BlasTransposeType::T) { - cublas_ldb = m; - } else { - UNIMPLEMENTED(); - } - const int cublas_ldc = n; - // CublasMathModeGuard guard(cuda_stream->cublas_handle()); -// if (data_type == DataType::kFloat16) { -// #if CUDA_VERSION < 11000 -// guard.SetMathMode(CUBLAS_TENSOR_OP_MATH); -// #else -// guard.SetMathMode(CUBLAS_DEFAULT_MATH); -// #endif // CUDA_VERSION < 11000 -// } -// #if CUDA_VERSION >= 11000 -// hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; - hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; -// #else -// hipblasGemmAlgo_t algo = -// (data_type == DataType::kFloat16) ? CUBLAS_GEMM_DFALT_TENSOR_OP : HIPBLAS_GEMM_DEFAULT; -// #endif - if (num_batch_dims == 1 && c_batch_dims[0] != 1) { - const void* cublas_a = b; - const void* cublas_b = a; - void* cublas_c = c; - const int64_t a_batch_count = a_batch_dims[0]; - const int64_t b_batch_count = b_batch_dims[0]; - CHECK(a_batch_count == 1 || b_batch_count == 1 || a_batch_count == b_batch_count); - CHECK_GT(a_batch_count, 0); - CHECK_GT(b_batch_count, 0); - const int batch_count = std::max(a_batch_count, b_batch_count); - const long long int cublas_stride_a = b_batch_count == 1 ? 0 : cublas_m * cublas_k; - const long long int cublas_stride_b = a_batch_count == 1 ? 
0 : cublas_k * cublas_n; - const long long int cublas_stride_c = cublas_m * cublas_n; - const auto sp_beta = GetCublasScalarParameter(beta, compute_type); - __half h_beta = 0; - if (compute_type == HIPBLAS_R_16F) { - h_beta = __float2half(sp_beta.s); - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, - &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, - cublas_ldb, cublas_stride_b, &h_beta, cublas_c, cuda_data_type, cublas_ldc, - cublas_stride_c, batch_count, compute_type, algo)); - } else { - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, - &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, - cublas_ldb, cublas_stride_b, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, - cublas_stride_c, batch_count, compute_type, algo)); - } - - } else { - auto func = [&](const void* batch_a, const void* batch_b, void* batch_c, Scalar batch_beta) { - const auto sp_beta = GetCublasScalarParameter(batch_beta, compute_type); - __half h_beta = 0; - const void* cublas_a = batch_b; - const void* cublas_b = batch_a; - void* cublas_c = batch_c; - if (compute_type == HIPBLAS_R_16F) { - h_beta = __float2half(sp_beta.s); - OF_CUBLAS_CHECK(hipblasGemmEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, - cublas_k, &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, - cublas_ldb, &h_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); - } else { - OF_CUBLAS_CHECK(hipblasGemmEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, - cublas_k, &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, - cublas_ldb, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); - } - - }; - ForEachMatmul(data_type, m, n, k, beta, num_batch_dims, broadcast_batch_dims, - a_batch_dims, b_batch_dims, c_batch_dims, a, b, c, func); - } -} - -class BroadcastMatmulFactoryImpl : public BroadcastMatmulFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(BroadcastMatmulFactoryImpl); - BroadcastMatmulFactoryImpl() = default; - ~BroadcastMatmulFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type, BlasTransposeType transpose_a, - BlasTransposeType transpose_b, - size_t max_num_dims) override { - auto cuda_data_type = OptCudaDataType(data_type); - if (max_num_dims <= kMaxNumDims && cuda_data_type.has_value()) { - return std::make_unique>(data_type, transpose_a, - transpose_b); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastMatmulFactory, BroadcastMatmulFactoryImpl); - -} // namespace - -} // namespace internal - -} // namespace broadcast_matmul - -} // namespace primitive -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
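LaunchBroadcastMatmul above hands hipBLAS the operands swapped (cublas_a = b, cublas_b = a, transposes exchanged) and m/n exchanged. That is the standard way to obtain a row-major C = A * B from a column-major GEMM: asking the column-major routine for the n-by-m product of the reinterpreted buffers writes exactly the row-major m-by-n result. A self-contained check of that identity, using a naive column-major GEMM as a stand-in for the hipblasGemmEx call:

#include <cassert>
#include <cstdio>
#include <vector>

// Naive column-major GEMM: C(m x n) = A(m x k) * B(k x n), leading dims in rows.
void GemmColMajor(int m, int n, int k, const float* A, int lda, const float* B, int ldb,
                  float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  const int m = 2, n = 3, k = 4;
  std::vector<float> a(m * k), b(k * n), c(m * n), ref(m * n);
  for (int i = 0; i < m * k; ++i) a[i] = 0.5f * i;          // row-major A
  for (int i = 0; i < k * n; ++i) b[i] = 1.0f - 0.25f * i;  // row-major B
  // Row-major reference: C[i][j] = sum_p A[i][p] * B[p][j].
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += a[i * k + p] * b[p * n + j];
      ref[i * n + j] = acc;
    }
  // The "swapped" call used above: column-major GEMM with (B, A), dims (n, m, k),
  // lda = n, ldb = k, ldc = n. The output buffer ends up holding row-major C.
  GemmColMajor(n, m, k, b.data(), n, a.data(), k, c.data(), n);
  for (int i = 0; i < m * n; ++i) assert(c[i] == ref[i]);
  std::printf("row-major C via column-major swapped GEMM: OK\n");
  return 0;
}

The strided-batched branch in the hunk applies the same swap per batch, setting the stride of whichever operand has batch count 1 to zero so it is broadcast across batches.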
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_ROCM + +#include "oneflow/core/ep/include/primitive/primitive.h" +#include "oneflow/core/ep/include/primitive/broadcast_matmul.h" +#include "oneflow/core/ep/common/primitive/broadcast_matmul.h" +#include "oneflow/core/common/optional.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include +#include +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace broadcast_matmul { + +namespace internal { + +namespace { + +constexpr size_t kMaxNumDims = 8; + +Optional OptCudaDataType(DataType data_type) { + switch (data_type) { + case kFloat: return HIPBLAS_R_32F; + case kDouble: return HIPBLAS_R_64F; + case kFloat16: return HIPBLAS_R_16F; +// #if CUDA_VERSION >= 11000 +// case kBFloat16: return CUDA_R_16BF; +// #endif // CUDA_VERSION >= 11000 + default: return NullOpt; + } +} + +hipblasDatatype_t GetCudaDataType(DataType data_type) { + auto cuda_data_type = OptCudaDataType(data_type); + CHECK(cuda_data_type.has_value()); + return cuda_data_type.value_or(HIPBLAS_R_32F); +} + +union CublasScalarParameter { + double d; + float s; +}; + +CublasScalarParameter GetCublasScalarParameter(Scalar scalar, hipblasDatatype_t compute_type) { + CublasScalarParameter sp{}; + if (compute_type == HIPBLAS_R_64F) { + sp.d = scalar.Value(); + } else if (compute_type == HIPBLAS_R_32F) { + sp.s = scalar.Value(); + } else if (compute_type == HIPBLAS_R_16F) { + sp.s = scalar.Value(); + } else { + UNIMPLEMENTED(); + } + return sp; +} + +hipblasDatatype_t GetComputeType(DataType data_type) { + switch (data_type) { + case kFloat: return HIPBLAS_R_32F; + case kDouble: return HIPBLAS_R_64F; + case kFloat16: return HIPBLAS_R_16F; +// #if CUDA_VERSION >= 11000 +// case kBFloat16: return HIPBLAS_R_32F; +// #endif // CUDA_VERSION >= 11000 + default: UNIMPLEMENTED(); return HIPBLAS_R_32F; + } +} + +void LaunchBroadcastMatmul(Stream* stream, DataType data_type, BlasTransposeType transpose_a, + BlasTransposeType transpose_b, int64_t num_batch_dims, + const int64_t* broadcast_batch_dims, const int64_t* a_batch_dims, + const int64_t* b_batch_dims, const int64_t* c_batch_dims, int64_t m, + int64_t n, int64_t k, Scalar alpha, const void* a, const void* b, + Scalar beta, void* c) { + auto* cuda_stream = stream->As(); + const auto cuda_data_type = GetCudaDataType(data_type); + const auto compute_type = GetComputeType(data_type); + const auto sp_alpha = GetCublasScalarParameter(alpha, compute_type); + __half h_alpha = 0; + if (compute_type == HIPBLAS_R_16F) { + h_alpha = __float2half(sp_alpha.s); + } + const auto GetCublasOperation = [](BlasTransposeType transpose_type) { + if (transpose_type == BlasTransposeType::N) { + return HIPBLAS_OP_N; + } else if (transpose_type == BlasTransposeType::T) { + return HIPBLAS_OP_T; + } else { + UNIMPLEMENTED(); + return HIPBLAS_OP_N; + } + }; + const hipblasOperation_t cublas_trans_a = GetCublasOperation(transpose_b); + const hipblasOperation_t cublas_trans_b = GetCublasOperation(transpose_a); + const int cublas_m = n; + const int cublas_n = m; + const int cublas_k = k; + int cublas_lda = 0; + if (transpose_b == BlasTransposeType::N) { + cublas_lda = n; + } else if (transpose_b == BlasTransposeType::T) { + cublas_lda = k; + } else { + UNIMPLEMENTED(); + } + int cublas_ldb = 0; + if (transpose_a == BlasTransposeType::N) { + cublas_ldb = k; + } else if (transpose_a == BlasTransposeType::T) { + cublas_ldb = m; + } else { + 
UNIMPLEMENTED(); + } + const int cublas_ldc = n; + // CublasMathModeGuard guard(cuda_stream->cublas_handle()); +// if (data_type == DataType::kFloat16) { +// #if CUDA_VERSION < 11000 +// guard.SetMathMode(CUBLAS_TENSOR_OP_MATH); +// #else +// guard.SetMathMode(CUBLAS_DEFAULT_MATH); +// #endif // CUDA_VERSION < 11000 +// } +// #if CUDA_VERSION >= 11000 +// hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; + hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; +// #else +// hipblasGemmAlgo_t algo = +// (data_type == DataType::kFloat16) ? CUBLAS_GEMM_DFALT_TENSOR_OP : HIPBLAS_GEMM_DEFAULT; +// #endif + if (num_batch_dims == 1 && c_batch_dims[0] != 1) { + const void* cublas_a = b; + const void* cublas_b = a; + void* cublas_c = c; + const int64_t a_batch_count = a_batch_dims[0]; + const int64_t b_batch_count = b_batch_dims[0]; + CHECK(a_batch_count == 1 || b_batch_count == 1 || a_batch_count == b_batch_count); + CHECK_GT(a_batch_count, 0); + CHECK_GT(b_batch_count, 0); + const int batch_count = std::max(a_batch_count, b_batch_count); + const long long int cublas_stride_a = b_batch_count == 1 ? 0 : cublas_m * cublas_k; + const long long int cublas_stride_b = a_batch_count == 1 ? 0 : cublas_k * cublas_n; + const long long int cublas_stride_c = cublas_m * cublas_n; + const auto sp_beta = GetCublasScalarParameter(beta, compute_type); + __half h_beta = 0; + if (compute_type == HIPBLAS_R_16F) { + h_beta = __float2half(sp_beta.s); + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, + &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, + cublas_ldb, cublas_stride_b, &h_beta, cublas_c, cuda_data_type, cublas_ldc, + cublas_stride_c, batch_count, compute_type, algo)); + } else { + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, + &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, + cublas_ldb, cublas_stride_b, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, + cublas_stride_c, batch_count, compute_type, algo)); + } + + } else { + auto func = [&](const void* batch_a, const void* batch_b, void* batch_c, Scalar batch_beta) { + const auto sp_beta = GetCublasScalarParameter(batch_beta, compute_type); + __half h_beta = 0; + const void* cublas_a = batch_b; + const void* cublas_b = batch_a; + void* cublas_c = batch_c; + if (compute_type == HIPBLAS_R_16F) { + h_beta = __float2half(sp_beta.s); + OF_CUBLAS_CHECK(hipblasGemmEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, + cublas_k, &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, + cublas_ldb, &h_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); + } else { + OF_CUBLAS_CHECK(hipblasGemmEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, + cublas_k, &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, + cublas_ldb, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); + } + + }; + ForEachMatmul(data_type, m, n, k, beta, num_batch_dims, broadcast_batch_dims, + a_batch_dims, b_batch_dims, c_batch_dims, a, b, c, func); + } +} + +class BroadcastMatmulFactoryImpl : public BroadcastMatmulFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(BroadcastMatmulFactoryImpl); + BroadcastMatmulFactoryImpl() = default; + ~BroadcastMatmulFactoryImpl() override = default; + + std::unique_ptr 
New(DataType data_type, BlasTransposeType transpose_a, + BlasTransposeType transpose_b, + size_t max_num_dims) override { + auto cuda_data_type = OptCudaDataType(data_type); + if (max_num_dims <= kMaxNumDims && cuda_data_type.has_value()) { + return std::make_unique>(data_type, transpose_a, + transpose_b); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastMatmulFactory, BroadcastMatmulFactoryImpl); + +} // namespace + +} // namespace internal + +} // namespace broadcast_matmul + +} // namespace primitive +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/primitive/cast.hip.cpp b/oneflow/core/ep/rocm/primitive/cast.hip.cpp index d65d126..d2e60b9 100644 --- a/oneflow/core/ep/rocm/primitive/cast.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/cast.hip.cpp @@ -1,148 +1,148 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -struct CastFunctor { - __device__ To operator()(From from) const { return static_cast(from); } -}; - -template -struct CastFunctor::value>::type> { - __device__ To operator()(half from) const { return static_cast(static_cast(from)); } - - __device__ void Apply2(To* to, const half* from) const { - const float2 f2 = __half22float2(*reinterpret_cast(from)); - to[0] = static_cast(f2.x); - to[1] = static_cast(f2.y); - } -}; - -template -struct CastFunctor::value>::type> { - __device__ half operator()(From from) const { - return static_cast(static_cast(from)); - } - - __device__ void Apply2(half* to, const From* from) const { - float2 f2; - f2.x = static_cast(from[0]); - f2.y = static_cast(from[1]); - *reinterpret_cast(to) = __float22half2_rn(f2); - } -}; - -// #if CUDA_VERSION >= 11000 - -// template -// struct CastFunctor::value -// || std::is_same::value)>::type> { -// __device__ To operator()(nv_bfloat16 from) const { -// return static_cast(static_cast(from)); -// } -// }; - -// template -// struct CastFunctor::value -// || std::is_same::value)>::type> { -// __device__ nv_bfloat16 operator()(From from) const { -// return static_cast(static_cast(from)); -// } -// }; - -// #endif // CUDA_VERSION >= 11000 - -template -class CastImpl : public Cast { - public: - OF_DISALLOW_COPY_AND_MOVE(CastImpl); - explicit CastImpl() = default; - ~CastImpl() override = default; - - void Launch(Stream* stream, const void* from, void* to, size_t count) override { - auto* cuda_stream = stream->As(); - OF_CUDA_CHECK((cuda::elementwise::Unary, To, From>( - CastFunctor(), count, reinterpret_cast(to), - reinterpret_cast(from), cuda_stream->cuda_stream()))); - } -}; - -template -std::unique_ptr NewCast() { - return 
std::unique_ptr(new CastImpl()); -} - -#define CUDA_PRIMITIVE_CAST_TYPE_SEQ \ - CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ - CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ - CUDA_PRIMITIVE_INT8_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ - CUDA_PRIMITIVE_INT32_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT32_TYPE_SEQ \ - CUDA_PRIMITIVE_INT64_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT64_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -class CastFactoryImpl : public CastFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(CastFactoryImpl); - CastFactoryImpl() = default; - ~CastFactoryImpl() override = default; - - std::unique_ptr New(DataType from, DataType to) override { -#define MAKE_NEW_CAST_ENTRY(from_pair, to_pair) \ - {std::make_pair(OF_PP_PAIR_SECOND(from_pair), OF_PP_PAIR_SECOND(to_pair)), \ - NewCast}, - - static const std::map, std::function()>> - new_cast_handle{OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - MAKE_NEW_CAST_ENTRY, CUDA_PRIMITIVE_CAST_TYPE_SEQ, CUDA_PRIMITIVE_CAST_TYPE_SEQ)}; - -#undef MAKE_NEW_CAST_ENTRY - - const auto it = new_cast_handle.find(std::make_pair(from, to)); - if (it != new_cast_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CastFactory, CastFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
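CastFactoryImpl in this file resolves a (from, to) data-type pair through a static table whose entries are generated by OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE over CUDA_PRIMITIVE_CAST_TYPE_SEQ, each mapping the enum pair to a NewCast factory. A hand-rolled miniature of that lookup pattern follows; the enum, type names, and table contents are illustrative only, not OneFlow's:

#include <cstddef>
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <utility>

enum class DType { kFloat, kDouble };

struct CastBase {
  virtual ~CastBase() = default;
  virtual void Launch(const void* from, void* to, std::size_t count) = 0;
};

// Element-wise static_cast, the CPU analogue of one CastImpl instantiation.
template<typename From, typename To>
struct CastImpl : CastBase {
  void Launch(const void* from, void* to, std::size_t count) override {
    auto* src = static_cast<const From*>(from);
    auto* dst = static_cast<To*>(to);
    for (std::size_t i = 0; i < count; ++i) dst[i] = static_cast<To>(src[i]);
  }
};

template<typename From, typename To>
std::unique_ptr<CastBase> NewCast() { return std::make_unique<CastImpl<From, To>>(); }

std::unique_ptr<CastBase> MakeCast(DType from, DType to) {
  // In the patch this table is produced by a macro product over the cast type seq;
  // here the two entries are written out by hand.
  static const std::map<std::pair<DType, DType>, std::function<std::unique_ptr<CastBase>()>>
      table{{{DType::kFloat, DType::kDouble}, NewCast<float, double>},
            {{DType::kDouble, DType::kFloat}, NewCast<double, float>}};
  auto it = table.find({from, to});
  if (it == table.end()) return nullptr;
  return it->second();
}

int main() {
  float src[3] = {1.f, 2.f, 3.f};
  double dst[3] = {};
  MakeCast(DType::kFloat, DType::kDouble)->Launch(src, dst, 3);
  std::printf("%f %f %f\n", dst[0], dst[1], dst[2]);
  return 0;
}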
+*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +struct CastFunctor { + __device__ To operator()(From from) const { return static_cast(from); } +}; + +template +struct CastFunctor::value>::type> { + __device__ To operator()(half from) const { return static_cast(static_cast(from)); } + + __device__ void Apply2(To* to, const half* from) const { + const float2 f2 = __half22float2(*reinterpret_cast(from)); + to[0] = static_cast(f2.x); + to[1] = static_cast(f2.y); + } +}; + +template +struct CastFunctor::value>::type> { + __device__ half operator()(From from) const { + return static_cast(static_cast(from)); + } + + __device__ void Apply2(half* to, const From* from) const { + float2 f2; + f2.x = static_cast(from[0]); + f2.y = static_cast(from[1]); + *reinterpret_cast(to) = __float22half2_rn(f2); + } +}; + +// #if CUDA_VERSION >= 11000 + +// template +// struct CastFunctor::value +// || std::is_same::value)>::type> { +// __device__ To operator()(nv_bfloat16 from) const { +// return static_cast(static_cast(from)); +// } +// }; + +// template +// struct CastFunctor::value +// || std::is_same::value)>::type> { +// __device__ nv_bfloat16 operator()(From from) const { +// return static_cast(static_cast(from)); +// } +// }; + +// #endif // CUDA_VERSION >= 11000 + +template +class CastImpl : public Cast { + public: + OF_DISALLOW_COPY_AND_MOVE(CastImpl); + explicit CastImpl() = default; + ~CastImpl() override = default; + + void Launch(Stream* stream, const void* from, void* to, size_t count) override { + auto* cuda_stream = stream->As(); + OF_CUDA_CHECK((cuda::elementwise::Unary, To, From>( + CastFunctor(), count, reinterpret_cast(to), + reinterpret_cast(from), cuda_stream->cuda_stream()))); + } +}; + +template +std::unique_ptr NewCast() { + return std::unique_ptr(new CastImpl()); +} + +#define CUDA_PRIMITIVE_CAST_TYPE_SEQ \ + CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ + CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ + CUDA_PRIMITIVE_INT8_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ + CUDA_PRIMITIVE_INT32_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT32_TYPE_SEQ \ + CUDA_PRIMITIVE_INT64_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT64_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +class CastFactoryImpl : public CastFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(CastFactoryImpl); + CastFactoryImpl() = default; + ~CastFactoryImpl() override = default; + + std::unique_ptr New(DataType from, DataType to) override { +#define MAKE_NEW_CAST_ENTRY(from_pair, to_pair) \ + {std::make_pair(OF_PP_PAIR_SECOND(from_pair), OF_PP_PAIR_SECOND(to_pair)), \ + NewCast}, + + static const std::map, std::function()>> + new_cast_handle{OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + MAKE_NEW_CAST_ENTRY, CUDA_PRIMITIVE_CAST_TYPE_SEQ, CUDA_PRIMITIVE_CAST_TYPE_SEQ)}; + +#undef MAKE_NEW_CAST_ENTRY + + const auto it = new_cast_handle.find(std::make_pair(from, to)); + if (it != new_cast_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CastFactory, CastFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp 
b/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp index fd0d037..be1a539 100644 --- a/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp @@ -1,255 +1,255 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/include/primitive/constant_pad.h" -#include "oneflow/core/ep/common/primitive/constant_pad.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { - -namespace primitive { - -namespace { - -template -__global__ void ConstantPadKernel(ConstantPadParams params, - StorageType packed_pad_val) { - const StorageType* src = reinterpret_cast(params.src); - StorageType* dst = reinterpret_cast(params.dst); - IndexType src_index[num_dims]; - IndexType dst_index[num_dims]; - CUDA_1D_KERNEL_LOOP_T(IndexType, linear_index, params.elem_cnt) { - params.dst_index_helper.OffsetToNdIndex(linear_index, dst_index); - bool if_pad = false; -#pragma unroll - for (int i = 0; i < num_dims; i++) { - if (dst_index[i] >= params.valid_start[i] && dst_index[i] < params.valid_end[i]) { - src_index[i] = dst_index[i] - params.valid_start[i]; - } else { - if_pad = true; - break; - } - } - StorageType dst_val = packed_pad_val; - if (!if_pad) { - const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); - dst_val = src[src_offset]; - } - dst[linear_index] = dst_val; - } -} - -template<> -half GetValue(Scalar value) { - return static_cast(GetValue(value)); -} - -// #if CUDA_VERSION >= 11000 - -// template<> -// nv_bfloat16 GetValue(Scalar value) { -// return static_cast(GetValue(value)); -// } - -// #endif // CUDA_VERSION >= 11000 - -template -void LaunchKernel(Stream* stream, ConstantPadParams params, - StorageType packed_pad_val, size_t elem_cnt) { - stream->As()->LaunchKernelDefaultWaves( - (ConstantPadKernel), elem_cnt, params, packed_pad_val); -} - -template -void LaunchKernel(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, - const int64_t* src_dims, const int64_t* padding_before, - const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { - ConstantPadParams params; - params.dst_index_helper = OffsetToIndexCalculator(dst_dims); - params.src_index_helper = NdIndexOffsetHelper(src_dims); - params.dst = dst; - params.src = src; - for (int i = 0; i < num_dims; i++) { - params.valid_start[i] = padding_before[i]; - params.valid_end[i] = dst_dims[i] - padding_after[i]; - } - params.elem_cnt = elem_cnt; - LaunchKernel(stream, params, packed_pad_val, elem_cnt); -} - -template -void DispatchIndexType(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, - const int64_t* src_dims, const int64_t* padding_before, - const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - LaunchKernel(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, packed_pad_val, - 
elem_cnt); - } else { - LaunchKernel(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, packed_pad_val, - elem_cnt); - } -} - -template -void DispatchPackSize(Stream* stream, void* dst, int64_t* dst_dims, const void* src, - int64_t* src_dims, int64_t* padding_before, int64_t* padding_after, - T pad_val) { - constexpr int32_t max_packsize = GetMaxPackSize(); - size_t launch_pack_size = GetLaunchPackSize(num_dims, dst, dst_dims, src, src_dims, - padding_before, padding_after); - - dst_dims[num_dims - 1] /= launch_pack_size; - src_dims[num_dims - 1] /= launch_pack_size; - padding_before[num_dims - 1] /= launch_pack_size; - padding_after[num_dims - 1] /= launch_pack_size; - - size_t elem_cnt = 1; - for (int i = 0; i < num_dims; i++) { elem_cnt *= dst_dims[i]; } - if (launch_pack_size == 1) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 2) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 4) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 8) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 16) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else { - UNIMPLEMENTED(); - } -} - -template -void LaunchWithSimplified(Stream* stream, size_t num_dims, void* dst, int64_t* dst_dims, - const void* src, int64_t* src_dims, int64_t* padding_before, - int64_t* padding_after, T pad_val) { - void (*func)(Stream* /*stream*/, void* /*dst*/, int64_t* /*dst_dims*/, const void* /*src*/, - int64_t* /*src_dims*/, int64_t* /*padding_before*/, int64_t* /*padding_after*/, T) = - nullptr; - if (num_dims == 1) { - func = DispatchPackSize<1, T>; - } else if (num_dims == 2) { - func = DispatchPackSize<2, T>; - } else if (num_dims == 3) { - func = DispatchPackSize<3, T>; - } else if (num_dims == 4) { - func = DispatchPackSize<4, T>; - } else if (num_dims == 5) { - func = DispatchPackSize<5, T>; - } else if (num_dims == 6) { - func = DispatchPackSize<6, T>; - } else if (num_dims == 7) { - func = DispatchPackSize<7, T>; - } else if (num_dims == 8) { - func = DispatchPackSize<8, T>; - } else { - UNIMPLEMENTED(); - } - func(stream, dst, dst_dims, src, src_dims, padding_before, padding_after, pad_val); -} - -template -void SimplifyThenLaunch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, - const int64_t* padding_before, const int64_t* padding_after, T pad_val, - void* dst) { - CHECK_LE(num_dims, kMaxNumDims); - int64_t simplified_dst_dims[kMaxNumDims]; - int64_t simplified_src_dims[kMaxNumDims]; - int64_t simplified_padding_before[kMaxNumDims]; - int64_t simplified_padding_after[kMaxNumDims]; - size_t simplified_num_dims = 1; - SimplifyPadDims(num_dims, src_dims, padding_before, padding_after, &simplified_num_dims, - simplified_dst_dims, simplified_src_dims, simplified_padding_before, - simplified_padding_after); - LaunchWithSimplified(stream, simplified_num_dims, dst, 
simplified_dst_dims, src, - simplified_src_dims, simplified_padding_before, simplified_padding_after, - pad_val); -} - -template -class ConstantPadImpl : public ConstantPad { - public: - OF_DISALLOW_COPY_AND_MOVE(ConstantPadImpl); - ConstantPadImpl() = default; - ~ConstantPadImpl() override = default; - - void Launch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, - const int64_t* padding_before, const int64_t* padding_after, Scalar pad_val, - void* dst) override { - SimplifyThenLaunch(stream, num_dims, src_dims, src, padding_before, padding_after, - GetValue(pad_val), dst); - } -}; - -template -std::unique_ptr NewConstantPad() { - return std::unique_ptr(new ConstantPadImpl()); -} - -class ConstantPadFactoryImpl : public ConstantPadFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(ConstantPadFactoryImpl); - ConstantPadFactoryImpl() = default; - ~ConstantPadFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_CONSTANT_PAD_ENTRY(type_cpp, type_proto) {type_proto, NewConstantPad}, - - static const std::map()>> - new_constant_pad_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_CONSTANT_PAD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; - -#undef MAKE_NEW_CONSTANT_PAD_ENTRY - - const auto it = new_constant_pad_handle.find(data_type); - if (it != new_constant_pad_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ConstantPadFactory, ConstantPadFactoryImpl); - -} // namespace - -} // namespace primitive - -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
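ConstantPadKernel above works per output element: it converts the linear offset to an n-d index, tests every axis against [valid_start, valid_end) where valid_end is dst_dim minus padding_after, and either copies the corresponding source element or writes the packed pad value. The same index logic rendered as a plain 2-d CPU routine, with illustrative names:

#include <cstdio>
#include <vector>

void ConstantPad2D(const std::vector<float>& src, int src_h, int src_w,
                   int pad_top, int pad_bottom, int pad_left, int pad_right, float pad_val,
                   std::vector<float>& dst) {
  const int dst_h = src_h + pad_top + pad_bottom;
  const int dst_w = src_w + pad_left + pad_right;
  dst.assign(dst_h * dst_w, 0.f);
  for (int linear = 0; linear < dst_h * dst_w; ++linear) {
    // Offset -> n-d index, as OffsetToNdIndex does in the kernel.
    const int y = linear / dst_w;
    const int x = linear % dst_w;
    // Inside [valid_start, valid_end) on both axes?
    const bool in_valid = y >= pad_top && y < pad_top + src_h
                          && x >= pad_left && x < pad_left + src_w;
    dst[linear] = in_valid ? src[(y - pad_top) * src_w + (x - pad_left)] : pad_val;
  }
}

int main() {
  std::vector<float> src = {1, 2, 3, 4};  // 2 x 2 input
  std::vector<float> dst;
  ConstantPad2D(src, 2, 2, 1, 1, 1, 1, -1.f, dst);  // pad to 4 x 4 with -1
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) std::printf("%5.1f", dst[y * 4 + x]);
    std::printf("\n");
  }
  return 0;
}

The Pack/DispatchPackSize machinery in the hunk only changes how many such elements are written per store along the innermost dimension; the per-element valid-range decision stays the same.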
+*/ +#include "oneflow/core/ep/include/primitive/constant_pad.h" +#include "oneflow/core/ep/common/primitive/constant_pad.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { + +namespace primitive { + +namespace { + +template +__global__ void ConstantPadKernel(ConstantPadParams params, + StorageType packed_pad_val) { + const StorageType* src = reinterpret_cast(params.src); + StorageType* dst = reinterpret_cast(params.dst); + IndexType src_index[num_dims]; + IndexType dst_index[num_dims]; + CUDA_1D_KERNEL_LOOP_T(IndexType, linear_index, params.elem_cnt) { + params.dst_index_helper.OffsetToNdIndex(linear_index, dst_index); + bool if_pad = false; +#pragma unroll + for (int i = 0; i < num_dims; i++) { + if (dst_index[i] >= params.valid_start[i] && dst_index[i] < params.valid_end[i]) { + src_index[i] = dst_index[i] - params.valid_start[i]; + } else { + if_pad = true; + break; + } + } + StorageType dst_val = packed_pad_val; + if (!if_pad) { + const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); + dst_val = src[src_offset]; + } + dst[linear_index] = dst_val; + } +} + +template<> +half GetValue(Scalar value) { + return static_cast(GetValue(value)); +} + +// #if CUDA_VERSION >= 11000 + +// template<> +// nv_bfloat16 GetValue(Scalar value) { +// return static_cast(GetValue(value)); +// } + +// #endif // CUDA_VERSION >= 11000 + +template +void LaunchKernel(Stream* stream, ConstantPadParams params, + StorageType packed_pad_val, size_t elem_cnt) { + stream->As()->LaunchKernelDefaultWaves( + (ConstantPadKernel), elem_cnt, params, packed_pad_val); +} + +template +void LaunchKernel(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, + const int64_t* src_dims, const int64_t* padding_before, + const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { + ConstantPadParams params; + params.dst_index_helper = OffsetToIndexCalculator(dst_dims); + params.src_index_helper = NdIndexOffsetHelper(src_dims); + params.dst = dst; + params.src = src; + for (int i = 0; i < num_dims; i++) { + params.valid_start[i] = padding_before[i]; + params.valid_end[i] = dst_dims[i] - padding_after[i]; + } + params.elem_cnt = elem_cnt; + LaunchKernel(stream, params, packed_pad_val, elem_cnt); +} + +template +void DispatchIndexType(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, + const int64_t* src_dims, const int64_t* padding_before, + const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + LaunchKernel(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, packed_pad_val, + elem_cnt); + } else { + LaunchKernel(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, packed_pad_val, + elem_cnt); + } +} + +template +void DispatchPackSize(Stream* stream, void* dst, int64_t* dst_dims, const void* src, + int64_t* src_dims, int64_t* padding_before, int64_t* padding_after, + T pad_val) { + constexpr int32_t max_packsize = GetMaxPackSize(); + size_t launch_pack_size = GetLaunchPackSize(num_dims, dst, dst_dims, src, src_dims, + padding_before, padding_after); + + dst_dims[num_dims - 1] /= launch_pack_size; + src_dims[num_dims - 1] /= launch_pack_size; + padding_before[num_dims - 1] /= launch_pack_size; + padding_after[num_dims - 1] /= launch_pack_size; + + size_t elem_cnt = 1; + for (int i = 0; i < num_dims; i++) { elem_cnt *= dst_dims[i]; } + if (launch_pack_size == 1) { + 
Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 2) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 4) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 8) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 16) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else { + UNIMPLEMENTED(); + } +} + +template +void LaunchWithSimplified(Stream* stream, size_t num_dims, void* dst, int64_t* dst_dims, + const void* src, int64_t* src_dims, int64_t* padding_before, + int64_t* padding_after, T pad_val) { + void (*func)(Stream* /*stream*/, void* /*dst*/, int64_t* /*dst_dims*/, const void* /*src*/, + int64_t* /*src_dims*/, int64_t* /*padding_before*/, int64_t* /*padding_after*/, T) = + nullptr; + if (num_dims == 1) { + func = DispatchPackSize<1, T>; + } else if (num_dims == 2) { + func = DispatchPackSize<2, T>; + } else if (num_dims == 3) { + func = DispatchPackSize<3, T>; + } else if (num_dims == 4) { + func = DispatchPackSize<4, T>; + } else if (num_dims == 5) { + func = DispatchPackSize<5, T>; + } else if (num_dims == 6) { + func = DispatchPackSize<6, T>; + } else if (num_dims == 7) { + func = DispatchPackSize<7, T>; + } else if (num_dims == 8) { + func = DispatchPackSize<8, T>; + } else { + UNIMPLEMENTED(); + } + func(stream, dst, dst_dims, src, src_dims, padding_before, padding_after, pad_val); +} + +template +void SimplifyThenLaunch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, + const int64_t* padding_before, const int64_t* padding_after, T pad_val, + void* dst) { + CHECK_LE(num_dims, kMaxNumDims); + int64_t simplified_dst_dims[kMaxNumDims]; + int64_t simplified_src_dims[kMaxNumDims]; + int64_t simplified_padding_before[kMaxNumDims]; + int64_t simplified_padding_after[kMaxNumDims]; + size_t simplified_num_dims = 1; + SimplifyPadDims(num_dims, src_dims, padding_before, padding_after, &simplified_num_dims, + simplified_dst_dims, simplified_src_dims, simplified_padding_before, + simplified_padding_after); + LaunchWithSimplified(stream, simplified_num_dims, dst, simplified_dst_dims, src, + simplified_src_dims, simplified_padding_before, simplified_padding_after, + pad_val); +} + +template +class ConstantPadImpl : public ConstantPad { + public: + OF_DISALLOW_COPY_AND_MOVE(ConstantPadImpl); + ConstantPadImpl() = default; + ~ConstantPadImpl() override = default; + + void Launch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, + const int64_t* padding_before, const int64_t* padding_after, Scalar pad_val, + void* dst) override { + SimplifyThenLaunch(stream, num_dims, src_dims, src, padding_before, padding_after, + GetValue(pad_val), dst); + } +}; + +template +std::unique_ptr NewConstantPad() { + return std::unique_ptr(new ConstantPadImpl()); +} + +class ConstantPadFactoryImpl : public ConstantPadFactory { + public: + 
OF_DISALLOW_COPY_AND_MOVE(ConstantPadFactoryImpl); + ConstantPadFactoryImpl() = default; + ~ConstantPadFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_CONSTANT_PAD_ENTRY(type_cpp, type_proto) {type_proto, NewConstantPad}, + + static const std::map()>> + new_constant_pad_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_CONSTANT_PAD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; + +#undef MAKE_NEW_CONSTANT_PAD_ENTRY + + const auto it = new_constant_pad_handle.find(data_type); + if (it != new_constant_pad_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ConstantPadFactory, ConstantPadFactoryImpl); + +} // namespace + +} // namespace primitive + +} // namespace ep + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp b/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp index 8dc4589..8b60dcf 100644 --- a/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp @@ -1,95 +1,95 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/ep/common/primitive/copy_nd.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -__global__ void CopyNdKernel(CopyNdKernelParams params) { - using T = typename std::aligned_storage::type; - const T* src = reinterpret_cast(params.src); - T* dst = reinterpret_cast(params.dst); - IndexType copy_index[num_dims]; - IndexType src_index[num_dims]; - IndexType dst_index[num_dims]; - CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { - params.copy_index_helper.OffsetToNdIndex(i, copy_index); -#pragma unroll - for (size_t j = 0; j < num_dims; ++j) { - src_index[j] = params.src_pos[j] + copy_index[j]; - dst_index[j] = params.dst_pos[j] + copy_index[j]; - } - const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); - const IndexType dst_offset = params.dst_index_helper.NdIndexToOffset(dst_index); - dst[dst_offset] = src[src_offset]; - } -} - -template -void LaunchKernel(Stream* stream, CopyNdKernelParams params) { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - CopyNdKernel - <<>>(params); -} - -class CopyNdImpl : public CopyNd { - public: - OF_DISALLOW_COPY_AND_MOVE(CopyNdImpl); - CopyNdImpl() = default; - ~CopyNdImpl() override = default; - - void Launch(Stream* stream, DataType data_type, size_t num_dims, void* dst, - const int64_t* dst_dims, const int64_t* dst_pos, const void* src, - const int64_t* src_dims, const int64_t* src_pos, - const int64_t* extent) const override { - SimplifyThenLaunch(stream, data_type, num_dims, dst, dst_dims, dst_pos, src, src_dims, src_pos, - extent); - } -}; - -class CopyNdFactoryImpl : public CopyNdFactory { - public: - 
OF_DISALLOW_COPY_AND_MOVE(CopyNdFactoryImpl); - CopyNdFactoryImpl() = default; - ~CopyNdFactoryImpl() override = default; - - std::unique_ptr New(size_t max_num_dims) override { - if (max_num_dims <= kMaxNumDims) { - return std::unique_ptr(new CopyNdImpl()); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CopyNdFactory, CopyNdFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/ep/common/primitive/copy_nd.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +__global__ void CopyNdKernel(CopyNdKernelParams params) { + using T = typename std::aligned_storage::type; + const T* src = reinterpret_cast(params.src); + T* dst = reinterpret_cast(params.dst); + IndexType copy_index[num_dims]; + IndexType src_index[num_dims]; + IndexType dst_index[num_dims]; + CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { + params.copy_index_helper.OffsetToNdIndex(i, copy_index); +#pragma unroll + for (size_t j = 0; j < num_dims; ++j) { + src_index[j] = params.src_pos[j] + copy_index[j]; + dst_index[j] = params.dst_pos[j] + copy_index[j]; + } + const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); + const IndexType dst_offset = params.dst_index_helper.NdIndexToOffset(dst_index); + dst[dst_offset] = src[src_offset]; + } +} + +template +void LaunchKernel(Stream* stream, CopyNdKernelParams params) { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + CopyNdKernel + <<>>(params); +} + +class CopyNdImpl : public CopyNd { + public: + OF_DISALLOW_COPY_AND_MOVE(CopyNdImpl); + CopyNdImpl() = default; + ~CopyNdImpl() override = default; + + void Launch(Stream* stream, DataType data_type, size_t num_dims, void* dst, + const int64_t* dst_dims, const int64_t* dst_pos, const void* src, + const int64_t* src_dims, const int64_t* src_pos, + const int64_t* extent) const override { + SimplifyThenLaunch(stream, data_type, num_dims, dst, dst_dims, dst_pos, src, src_dims, src_pos, + extent); + } +}; + +class CopyNdFactoryImpl : public CopyNdFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(CopyNdFactoryImpl); + CopyNdFactoryImpl() = default; + ~CopyNdFactoryImpl() override = default; + + std::unique_ptr New(size_t max_num_dims) override { + if (max_num_dims <= kMaxNumDims) { + return std::unique_ptr(new CopyNdImpl()); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CopyNdFactory, CopyNdFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp b/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp index 1f0c93b..c04763a 100644 --- 
a/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp @@ -1,117 +1,117 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/common/primitive/elementwise_unary.h" -#include "oneflow/core/ep/rocm/primitive/unary_functor.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -class ElementwiseUnaryImpl : public ElementwiseUnary { - public: - OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryImpl); - ElementwiseUnaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} - ~ElementwiseUnaryImpl() override = default; - - void Launch(Stream* stream, const void* src, void* dst, size_t count) override { - auto* cuda_stream = stream->As(); - auto functor = UnaryFunctor(attr0, attr1); - OF_CUDA_CHECK((cuda::elementwise::Unary( - functor, count, reinterpret_cast(dst), reinterpret_cast(src), - cuda_stream->cuda_stream()))); - } - - protected: - Scalar attr0, attr1; -}; - -template -std::unique_ptr NewElementwiseUnary(Scalar attr0, Scalar attr1) { - return std::unique_ptr( - new ElementwiseUnaryImpl(attr0, attr1)); -} - -class ElementwiseUnaryFactoryImpl : public ElementwiseUnaryFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryFactoryImpl); - ElementwiseUnaryFactoryImpl() = default; - ~ElementwiseUnaryFactoryImpl() override = default; - - std::unique_ptr New(UnaryOp unary_op, DataType src_type, - DataType dst_dtype) override { - return New(unary_op, src_type, dst_dtype, Scalar(), Scalar()); - } - - std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, - Scalar attr0) override { - return New(unary_op, src_type, dst_dtype, attr0, Scalar()); - } - - std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, - Scalar attr0, Scalar attr1) override { -#define MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, dtype_pair) \ - {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(dtype_pair), OF_PP_PAIR_SECOND(dtype_pair)), \ - NewElementwiseUnary}, - -#define MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, src_type_pair, dst_dtype_pair) \ - {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(src_type_pair), OF_PP_PAIR_SECOND(dst_dtype_pair)), \ - NewElementwiseUnary}, - - static const std::map, - std::function(Scalar, Scalar)>> - new_elementwise_unary_handle{ - // For All Type OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) - // For Float Type OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_FLOATING_MATH_OP_SEQ, - CUDA_PRIMITIVE_FLOATING_TYPE_SEQ) - - // For Utils OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_UTILS_OP_SEQ, UTIL_OPS_DATA_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ) - - // For Logical OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_LOGICAL_OP_SEQ, 
CUDA_PRIMITIVE_ALL_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ)}; - -#undef MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY - -#undef MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY - const auto it = - new_elementwise_unary_handle.find(std::make_tuple(unary_op, src_type, dst_dtype)); - if (it != new_elementwise_unary_handle.end()) { - return it->second(attr0, attr1); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ElementwiseUnaryFactory, ElementwiseUnaryFactoryImpl); - -} // namespace -} // namespace primitive -} // namespace ep +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/common/primitive/elementwise_unary.h" +#include "oneflow/core/ep/rocm/primitive/unary_functor.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +class ElementwiseUnaryImpl : public ElementwiseUnary { + public: + OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryImpl); + ElementwiseUnaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} + ~ElementwiseUnaryImpl() override = default; + + void Launch(Stream* stream, const void* src, void* dst, size_t count) override { + auto* cuda_stream = stream->As(); + auto functor = UnaryFunctor(attr0, attr1); + OF_CUDA_CHECK((cuda::elementwise::Unary( + functor, count, reinterpret_cast(dst), reinterpret_cast(src), + cuda_stream->cuda_stream()))); + } + + protected: + Scalar attr0, attr1; +}; + +template +std::unique_ptr NewElementwiseUnary(Scalar attr0, Scalar attr1) { + return std::unique_ptr( + new ElementwiseUnaryImpl(attr0, attr1)); +} + +class ElementwiseUnaryFactoryImpl : public ElementwiseUnaryFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryFactoryImpl); + ElementwiseUnaryFactoryImpl() = default; + ~ElementwiseUnaryFactoryImpl() override = default; + + std::unique_ptr New(UnaryOp unary_op, DataType src_type, + DataType dst_dtype) override { + return New(unary_op, src_type, dst_dtype, Scalar(), Scalar()); + } + + std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, + Scalar attr0) override { + return New(unary_op, src_type, dst_dtype, attr0, Scalar()); + } + + std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, + Scalar attr0, Scalar attr1) override { +#define MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, dtype_pair) \ + {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(dtype_pair), OF_PP_PAIR_SECOND(dtype_pair)), \ + NewElementwiseUnary}, + +#define MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, src_type_pair, dst_dtype_pair) \ + {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(src_type_pair), OF_PP_PAIR_SECOND(dst_dtype_pair)), \ + NewElementwiseUnary}, + + static const std::map, + std::function(Scalar, Scalar)>> + new_elementwise_unary_handle{ + // For All Type OP + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) + // For Float Type OP + 
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_FLOATING_MATH_OP_SEQ, + CUDA_PRIMITIVE_FLOATING_TYPE_SEQ) + + // For Utils OP + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_UTILS_OP_SEQ, UTIL_OPS_DATA_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ) + + // For Logical OP + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ)}; + +#undef MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY + +#undef MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY + const auto it = + new_elementwise_unary_handle.find(std::make_tuple(unary_op, src_type, dst_dtype)); + if (it != new_elementwise_unary_handle.end()) { + return it->second(attr0, attr1); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ElementwiseUnaryFactory, ElementwiseUnaryFactoryImpl); + +} // namespace +} // namespace primitive +} // namespace ep } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/fill.hip.cpp b/oneflow/core/ep/rocm/primitive/fill.hip.cpp index c77b251..a81d6a6 100644 --- a/oneflow/core/ep/rocm/primitive/fill.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/fill.hip.cpp @@ -1,151 +1,151 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
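// The elementwise-unary factory above keys its dispatch table by the triple
// (unary op, source dtype, destination dtype), so one UnaryOp can map to different
// instantiations (for example, logical ops always produce bool outputs). A minimal
// sketch of that tuple-keyed lookup with hypothetical enums and lambdas; it is not
// OneFlow's UnaryFunctor machinery.
#include <functional>
#include <iostream>
#include <map>
#include <tuple>

enum class ToyOp { kRelu, kLogicalNot };
enum class ToyType { kFloat, kBool };

using ToyFn = std::function<double(double)>;

static ToyFn Lookup(ToyOp op, ToyType src, ToyType dst) {
  static const std::map<std::tuple<ToyOp, ToyType, ToyType>, ToyFn> handle{
      // Same-dtype math op: float -> float.
      {std::make_tuple(ToyOp::kRelu, ToyType::kFloat, ToyType::kFloat),
       [](double x) { return x > 0.0 ? x : 0.0; }},
      // Different-dtype logical op: float in, bool out.
      {std::make_tuple(ToyOp::kLogicalNot, ToyType::kFloat, ToyType::kBool),
       [](double x) { return x == 0.0 ? 1.0 : 0.0; }},
  };
  const auto it = handle.find(std::make_tuple(op, src, dst));
  return it != handle.end() ? it->second : ToyFn();  // empty == unsupported combination
}

int main() {
  const ToyFn relu = Lookup(ToyOp::kRelu, ToyType::kFloat, ToyType::kFloat);
  std::cout << relu(-2.5) << " " << relu(3.0) << "\n";  // prints: 0 3
  const ToyFn missing = Lookup(ToyOp::kRelu, ToyType::kFloat, ToyType::kBool);
  std::cout << (missing ? "found" : "unsupported") << "\n";  // prints: unsupported
  return 0;
}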
-*/ -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -using Storage = typename std::aligned_storage::type; - -template -union Pack { - static constexpr size_t size = sizeof(T) * pack; - explicit __device__ __host__ Pack(T value) { - static_assert(sizeof(Pack) == size, ""); - static_assert(alignof(Pack) == size, ""); -#pragma unroll - for (size_t i = 0; i < pack; ++i) { elem[i] = value; } - } - T elem[pack]; - Storage storage; -}; - -template -__global__ void FillGpu(T* dst, T value, size_t count) { - const size_t pack_count = count / pack; - Pack pack_value(value); - auto* pack_dst = reinterpret_cast(dst); - CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; } - T* tail_dst = dst + pack_count * pack; - const size_t tail_count = count - pack_count * pack; - CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = value; } -} - -template -T GetValue(Scalar value) { - return value.Value(); -} - -template<> -half GetValue(Scalar value) { - return static_cast(GetValue(value)); -} - -// #if CUDA_VERSION >= 11000 - -// template<> -// nv_bfloat16 GetValue(Scalar value) { -// return static_cast(GetValue(value)); -// } - -// #endif // CUDA_VERSION >= 11000 - -template -typename std::enable_if<(pack != 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, - T value, size_t count) { - FillGpu - <<>>(dst, value, count); -} - -template -typename std::enable_if<(pack == 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, - T value, size_t count) { - LOG(FATAL) << "wrong alignment"; -} - -template -void LaunchFill(hipStream_t stream, T* dst, T value, size_t count) { - auto uintptr = reinterpret_cast(dst); - if (uintptr % 16 == 0) { - LaunchPackFill(stream, dst, value, count); - } else if (uintptr % 8 == 0) { - LaunchPackFill(stream, dst, value, count); - } else if (uintptr % 4 == 0) { - LaunchPackFill(stream, dst, value, count); - } else if (uintptr % 2 == 0) { - LaunchPackFill(stream, dst, value, count); - } else { - LaunchPackFill(stream, dst, value, count); - } -} - -template -class FillImpl : public Fill { - public: - OF_DISALLOW_COPY_AND_MOVE(FillImpl); - FillImpl() = default; - ~FillImpl() override = default; - - void Launch(Stream* stream, void* dst, Scalar value, size_t count) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - LaunchFill(cuda_stream, reinterpret_cast(dst), GetValue(value), count); - } -}; - -template -std::unique_ptr NewFill() { - return std::unique_ptr(new FillImpl()); -} - -class FillFactoryImpl : public FillFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(FillFactoryImpl); - FillFactoryImpl() = default; - ~FillFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewFill}, - - static const std::map()>> new_fill_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; - -#undef MAKE_NEW_FILL_ENTRY - - const auto it = new_fill_handle.find(data_type); - if (it != new_fill_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, FillFactory, FillFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +using Storage = typename std::aligned_storage::type; + +template +union Pack { + static constexpr size_t size = sizeof(T) * pack; + explicit __device__ __host__ Pack(T value) { + static_assert(sizeof(Pack) == size, ""); + static_assert(alignof(Pack) == size, ""); +#pragma unroll + for (size_t i = 0; i < pack; ++i) { elem[i] = value; } + } + T elem[pack]; + Storage storage; +}; + +template +__global__ void FillGpu(T* dst, T value, size_t count) { + const size_t pack_count = count / pack; + Pack pack_value(value); + auto* pack_dst = reinterpret_cast(dst); + CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; } + T* tail_dst = dst + pack_count * pack; + const size_t tail_count = count - pack_count * pack; + CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = value; } +} + +template +T GetValue(Scalar value) { + return value.Value(); +} + +template<> +half GetValue(Scalar value) { + return static_cast(GetValue(value)); +} + +// #if CUDA_VERSION >= 11000 + +// template<> +// nv_bfloat16 GetValue(Scalar value) { +// return static_cast(GetValue(value)); +// } + +// #endif // CUDA_VERSION >= 11000 + +template +typename std::enable_if<(pack != 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, + T value, size_t count) { + FillGpu + <<>>(dst, value, count); +} + +template +typename std::enable_if<(pack == 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, + T value, size_t count) { + LOG(FATAL) << "wrong alignment"; +} + +template +void LaunchFill(hipStream_t stream, T* dst, T value, size_t count) { + auto uintptr = reinterpret_cast(dst); + if (uintptr % 16 == 0) { + LaunchPackFill(stream, dst, value, count); + } else if (uintptr % 8 == 0) { + LaunchPackFill(stream, dst, value, count); + } else if (uintptr % 4 == 0) { + LaunchPackFill(stream, dst, value, count); + } else if (uintptr % 2 == 0) { + LaunchPackFill(stream, dst, value, count); + } else { + LaunchPackFill(stream, dst, value, count); + } +} + +template +class FillImpl : public Fill { + public: + OF_DISALLOW_COPY_AND_MOVE(FillImpl); + FillImpl() = default; + ~FillImpl() override = default; + + void Launch(Stream* stream, void* dst, Scalar value, size_t count) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + LaunchFill(cuda_stream, reinterpret_cast(dst), GetValue(value), count); + } +}; + +template +std::unique_ptr NewFill() { + return std::unique_ptr(new FillImpl()); +} + +class FillFactoryImpl : public FillFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(FillFactoryImpl); + FillFactoryImpl() = default; + ~FillFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewFill}, + + 
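// LaunchFill above picks the widest vectorized store the destination pointer allows:
// a 16-byte pack when the address is 16-byte aligned, otherwise 8, 4, 2, or a plain
// scalar store; the pack width in elements is pack_bytes / sizeof(T), and the tail
// that does not fill a whole pack is written element by element. A host-side sketch
// of only that selection logic (hypothetical names); the original treats an alignment
// smaller than sizeof(T) as an error, here it just falls back to scalar stores.
#include <cstddef>
#include <cstdint>
#include <cstdio>

template<typename T>
size_t PackElemsFor(const void* dst) {
  const auto addr = reinterpret_cast<std::uintptr_t>(dst);
  size_t pack_bytes = 1;
  if (addr % 16 == 0) {
    pack_bytes = 16;
  } else if (addr % 8 == 0) {
    pack_bytes = 8;
  } else if (addr % 4 == 0) {
    pack_bytes = 4;
  } else if (addr % 2 == 0) {
    pack_bytes = 2;
  }
  return pack_bytes >= sizeof(T) ? pack_bytes / sizeof(T) : 1;
}

int main() {
  alignas(16) float buf[8];
  const size_t pack = PackElemsFor<float>(buf);  // 4 floats per 16-byte pack
  const size_t count = 7;
  std::printf("pack=%zu full_packs=%zu tail=%zu\n", pack, count / pack, count % pack);
  return 0;
}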
static const std::map()>> new_fill_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; + +#undef MAKE_NEW_FILL_ENTRY + + const auto it = new_fill_handle.find(data_type); + if (it != new_fill_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, FillFactory, FillFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/memcpy.cpp b/oneflow/core/ep/rocm/primitive/memcpy.cpp index 19624e4..f6b2600 100644 --- a/oneflow/core/ep/rocm/primitive/memcpy.cpp +++ b/oneflow/core/ep/rocm/primitive/memcpy.cpp @@ -1,62 +1,62 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM - -#include "oneflow/core/ep/include/primitive/memcpy.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -class MemcpyImpl : public Memcpy { - public: - OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl); - MemcpyImpl() = default; - ~MemcpyImpl() override = default; - - void Launch(Stream* stream, void* dst, const void* src, size_t count) override { - if (dst == src) { return; } - auto* cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream())); - } -}; - -class MemcpyFactoryImpl : public MemcpyFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl); - MemcpyFactoryImpl() = default; - ~MemcpyFactoryImpl() override = default; - - std::unique_ptr New(MemcpyKind kind) override { - return std::unique_ptr(new MemcpyImpl()); - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow - -#endif +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM + +#include "oneflow/core/ep/include/primitive/memcpy.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +class MemcpyImpl : public Memcpy { + public: + OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl); + MemcpyImpl() = default; + ~MemcpyImpl() override = default; + + void Launch(Stream* stream, void* dst, const void* src, size_t count) override { + if (dst == src) { return; } + auto* cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream())); + } +}; + +class MemcpyFactoryImpl : public MemcpyFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl); + MemcpyFactoryImpl() = default; + ~MemcpyFactoryImpl() override = default; + + std::unique_ptr New(MemcpyKind kind) override { + return std::unique_ptr(new MemcpyImpl()); + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow + +#endif diff --git a/oneflow/core/ep/rocm/primitive/memset.cpp b/oneflow/core/ep/rocm/primitive/memset.cpp index f92fc4d..9d912b6 100644 --- a/oneflow/core/ep/rocm/primitive/memset.cpp +++ b/oneflow/core/ep/rocm/primitive/memset.cpp @@ -1,59 +1,59 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM - -#include "oneflow/core/ep/include/primitive/memset.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -class MemsetImpl : public Memset { - public: - OF_DISALLOW_COPY_AND_MOVE(MemsetImpl); - MemsetImpl() = default; - ~MemsetImpl() override = default; - - void Launch(Stream* stream, void* ptr, int value, size_t count) override { - auto* cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream())); - } -}; - -class MemsetFactoryImpl : public MemsetFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl); - MemsetFactoryImpl() = default; - ~MemsetFactoryImpl() override = default; - - std::unique_ptr New() override { return std::unique_ptr(new MemsetImpl()); } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow - -#endif +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_ROCM + +#include "oneflow/core/ep/include/primitive/memset.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +class MemsetImpl : public Memset { + public: + OF_DISALLOW_COPY_AND_MOVE(MemsetImpl); + MemsetImpl() = default; + ~MemsetImpl() override = default; + + void Launch(Stream* stream, void* ptr, int value, size_t count) override { + auto* cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream())); + } +}; + +class MemsetFactoryImpl : public MemsetFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl); + MemsetFactoryImpl() = default; + ~MemsetFactoryImpl() override = default; + + std::unique_ptr New() override { return std::unique_ptr(new MemsetImpl()); } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow + +#endif diff --git a/oneflow/core/ep/rocm/primitive/permute.hip.cpp b/oneflow/core/ep/rocm/primitive/permute.hip.cpp index afc26a3..78ae275 100644 --- a/oneflow/core/ep/rocm/primitive/permute.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/permute.hip.cpp @@ -1,333 +1,333 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
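// The Memcpy and Memset primitives above are thin wrappers: Memcpy always passes
// hipMemcpyDefault so the runtime infers the copy direction from the pointers, and
// both only enqueue work on the primitive's HIP stream, leaving synchronization to
// the caller. A standalone sketch of the same calls against a raw HIP stream; it is
// illustrative usage, not OneFlow's wrapper, and error handling is reduced to abort().
#include <hip/hip_runtime.h>
#include <cstdlib>
#include <vector>

#define HIP_OK(expr)                                \
  do {                                              \
    if ((expr) != hipSuccess) { std::abort(); }     \
  } while (0)

int main() {
  hipStream_t stream = nullptr;
  HIP_OK(hipStreamCreate(&stream));

  const size_t bytes = 1024 * sizeof(float);
  float* dev = nullptr;
  HIP_OK(hipMalloc((void**)&dev, bytes));

  std::vector<float> host(1024, 1.0f);
  // Zero the device buffer, then copy the host data in; both calls are asynchronous.
  HIP_OK(hipMemsetAsync(dev, 0, bytes, stream));
  HIP_OK(hipMemcpyAsync(dev, host.data(), bytes, hipMemcpyDefault, stream));

  // The primitives return immediately; the caller owns synchronization.
  HIP_OK(hipStreamSynchronize(stream));
  HIP_OK(hipFree(dev));
  HIP_OK(hipStreamDestroy(stream));
  return 0;
}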
-*/ -#include "oneflow/core/ep/include/primitive/permute.h" -#include "oneflow/core/ep/common/primitive/permute_impl.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace permute { - -namespace internal { - -namespace { - -constexpr int32_t kMov4TileSize = 32; -constexpr int32_t kMov2TileSize = 64; -constexpr int32_t kBlockRows = 8; - -template -__global__ void PermuteKernel(PermuteKernelParams params) { - using T = typename std::aligned_storage::type; - const T* src = reinterpret_cast(params.src); - T* dst = reinterpret_cast(params.dst); - IndexType src_index[num_dims]; - IndexType dst_index[num_dims]; - CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { - params.dst_index_helper.OffsetToNdIndex(i, dst_index); -#pragma unroll - for (size_t dim = 0; dim < num_dims; ++dim) { - src_index[params.permutation[dim]] = dst_index[dim]; - } - IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); - dst[i] = src[src_offset]; - } -} - -// (B, X, Y) -> (B, Y, X) -// refer from https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ -template -__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType rows, - IndexType cols, IndexType num_tile_rows, - IndexType num_tile_cols, int32_t block_nums) { - const IndexType src_rows = rows; - const IndexType src_cols = cols; - const IndexType dst_rows = cols; - const IndexType dst_cols = rows; - - using T = typename std::aligned_storage::type; - __shared__ T tile[tile_size][tile_size + 1]; // To avoid bank conflict. - - const T* src = reinterpret_cast(src_ptr); - T* dst = reinterpret_cast(dst_ptr); - - IndexType batch_num_tile = num_tile_rows * num_tile_cols; - for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { - const IndexType batch_index = i / batch_num_tile; // the index of batch. - const IndexType tile_index = - i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the - // flatten index of tile in a batch. - - const IndexType tile_row_index = - tile_index / num_tile_cols; // the row index of tile in a batch. - const IndexType tile_col_index = - tile_index - - tile_row_index - * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. - - const IndexType offset = batch_index * src_rows * src_cols; - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; - if (col_in_matrix < src_cols && row_in_matrix < src_rows) { - tile[row_in_tile][col_in_tile] = src[offset + row_in_matrix * src_cols + col_in_matrix]; - } - } - } - __syncthreads(); - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; - if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { - dst[offset + row_in_matrix * dst_cols + col_in_matrix] = tile[col_in_tile][row_in_tile]; - } - } - } - __syncthreads(); - } -} - -/* -Here is a Movementsie=2 version of Batch Transpose. -When the H W can be divided by 2. we can read data use movementsize=4, and write back as -movementsize=4. 
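// The comment above describes the movement-size-2 fast path: when rows and cols are
// both even and the pointers are 4-byte aligned, two 16-bit (half) elements are moved
// as one 32-bit word, halving the number of memory transactions. A host-side sketch of
// that reinterpretation using uint16_t/uint32_t as stand-ins for half/half2; the GPU
// kernel applies the same idea through its shared-memory union.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  std::uint16_t src[4] = {1, 2, 3, 4};
  std::uint16_t dst[4] = {0, 0, 0, 0};
  // Move four 16-bit values as two 32-bit words instead of four 16-bit accesses.
  for (int i = 0; i < 2; ++i) {
    std::uint32_t word;
    std::memcpy(&word, src + 2 * i, sizeof(word));  // one 32-bit load covers two elements
    std::memcpy(dst + 2 * i, &word, sizeof(word));  // one 32-bit store covers two elements
  }
  for (int i = 0; i < 4; ++i) { std::printf("%d ", int(dst[i])); }  // prints: 1 2 3 4
  std::printf("\n");
  return 0;
}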
-*/ -template -__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows, - IndexType cols, IndexType num_tile_rows, - IndexType num_tile_cols, int32_t block_nums) { - const IndexType src_rows = rows; - const IndexType src_cols = cols; - const IndexType dst_rows = cols; - const IndexType dst_cols = rows; - - static_assert(tile_size % 2 == 0, ""); - using T_MOV2 = typename std::aligned_storage<2, 2>::type; - using T_MOV4 = typename std::aligned_storage<4, 4>::type; - - const T_MOV4* src = reinterpret_cast(src_ptr); - T_MOV4* dst = reinterpret_cast(dst_ptr); - - // Use union structure to process Load and Store. - __shared__ union { - T_MOV2 tile_m2[tile_size][tile_size + 2]; // half [64][66] - T_MOV4 tile_m4[tile_size][tile_size / 2 + 1]; // half2 [64][33] - } tile_mem; - - IndexType batch_num_tile = num_tile_rows * num_tile_cols; - for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { - const IndexType batch_index = i / batch_num_tile; // the index of batch. - const IndexType tile_index = - i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the - // flatten index of tile in a batch. - - const IndexType tile_row_index = - tile_index / num_tile_cols; // the row index of tile in a batch. - const IndexType tile_col_index = - tile_index - - tile_row_index - * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. - - const IndexType offset = batch_index * src_rows * src_cols; - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x * 2; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; - if (col_in_matrix < src_cols && row_in_matrix < src_rows) { - tile_mem.tile_m4[row_in_tile][col_in_tile] = - src[(offset + row_in_matrix * src_cols + col_in_matrix) / 2]; - } - } - } - __syncthreads(); - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x * 2; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; - union { - T_MOV4 m4; - T_MOV2 m2[2]; - } tmp_storage; - - if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { - tmp_storage.m2[0] = tile_mem.tile_m2[col_in_tile * 2][row_in_tile]; - tmp_storage.m2[1] = tile_mem.tile_m2[col_in_tile * 2 + 1][row_in_tile]; - dst[(offset + row_in_matrix * dst_cols + col_in_matrix) / 2] = tmp_storage.m4; - } - } - } - __syncthreads(); - } -} - -template -void LaunchBatchTransposeKernel(hipStream_t& cuda_stream, - const PermuteKernelParams& params, - const IndexType& num_batches, const IndexType& rows, - const IndexType& cols) { - IndexType num_tile_rows = (rows + tile_size - 1) / tile_size; - IndexType num_tile_cols = (cols + tile_size - 1) / tile_size; - const int32_t block_nums = num_batches * num_tile_rows * num_tile_cols; - int32_t launched_block_nums = std::min(block_nums, kCudaMaxBlocksNum); - if (tile_size == kMov2TileSize) { - const int32_t half2_thread = tile_size / 2; // cause each thread process two half elements. - BatchTransposeMovement2Kernel - <<>>( - params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, - block_nums); // Set threads num as 32x8 cause each threads - // process 4 elements to 64x66 half share memory. 
- } else { - BatchTransposeKernel - <<>>( - params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, block_nums); - } -} - -template -bool CheckIfGreaterEqualThanTileSize(const IndexType& rows, const IndexType& cols) { - if (rows < tile_size || cols < tile_size) { return false; } - return true; -} - -template -bool CheckLaunchBatchTranspose(const int* permutation, const IndexType& num_batches, - const IndexType& rows, const IndexType& cols) { - if (CheckIfGreaterEqualThanTileSize(rows, cols)) { - if (num_batches == 1 && permutation[1] == 0 && permutation[0] == 1) { - // 2d tensor case: (0, 1) -> (1, 0) - return true; - } else if (num_dims == 3 && permutation[2] == 1 && permutation[1] == 2) { - // 3d tensor case: (0, 1, 2) -> (0, 2, 1) - return true; - } else { - return false; - } - } - return false; -} - -template -bool CheckUseMov2(const IndexType& rows, const IndexType& cols, const void* src, void* dst) { - auto src_ptr = reinterpret_cast(src); - auto dst_ptr = reinterpret_cast(dst); - return (movement_size == 2) && (rows % 2 == 0) && (cols % 2 == 0) && (src_ptr % 4 == 0) - && (dst_ptr % 4 == 0); -} - -template -void InferBatchTransposeShape(const int64_t* src_dims, IndexType* num_batches, IndexType* rows, - IndexType* cols) { - if (num_dims == 2) { - *num_batches = 1; - *rows = src_dims[0]; - *cols = src_dims[1]; - } else { - *num_batches = src_dims[0]; - *rows = src_dims[1]; - *cols = src_dims[2]; - } -} - -template -void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, const int* permutation, - void* dst, size_t count) { - PermuteKernelParams params = - MakePermuteParams(src_dims, src, permutation, dst, count); - hipStream_t cuda_stream = stream->As()->cuda_stream(); - - if (num_dims == 2 || num_dims == 3) { - IndexType num_batches; - IndexType rows; - IndexType cols; - InferBatchTransposeShape(src_dims, &num_batches, &rows, &cols); - if (CheckLaunchBatchTranspose(params.permutation, num_batches, rows, - cols)) { - if (CheckUseMov2(rows, cols, src, dst)) { - LaunchBatchTransposeKernel(cuda_stream, params, - num_batches, rows, cols); - } else { - LaunchBatchTransposeKernel( - cuda_stream, params, num_batches, rows, cols); - } - } else { - PermuteKernel - <<>>(params); - } - } else { - PermuteKernel - <<>>(params); - } -} - -class PermuteImpl : public Permute { - public: - OF_DISALLOW_COPY_AND_MOVE(PermuteImpl); - PermuteImpl() = default; - ~PermuteImpl() override = default; - - using Permute::Launch; - void Launch(Stream* stream, DataType data_type, size_t num_dims, const int64_t* src_dims, - const void* src, const int* permutation, void* dst) override { - SimplifyThenLaunch(stream, data_type, num_dims, src_dims, src, permutation, dst); - } -}; - -class PermuteFactoryImpl : public PermuteFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(PermuteFactoryImpl); - PermuteFactoryImpl() = default; - ~PermuteFactoryImpl() override = default; - - std::unique_ptr New(size_t max_num_dims) override { - if (max_num_dims <= kMaxNumDims) { - return std::unique_ptr(new PermuteImpl()); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, PermuteFactory, PermuteFactoryImpl); - -} // namespace - -} // namespace internal - -} // namespace permute - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/primitive/permute.h" +#include "oneflow/core/ep/common/primitive/permute_impl.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace permute { + +namespace internal { + +namespace { + +constexpr int32_t kMov4TileSize = 32; +constexpr int32_t kMov2TileSize = 64; +constexpr int32_t kBlockRows = 8; + +template +__global__ void PermuteKernel(PermuteKernelParams params) { + using T = typename std::aligned_storage::type; + const T* src = reinterpret_cast(params.src); + T* dst = reinterpret_cast(params.dst); + IndexType src_index[num_dims]; + IndexType dst_index[num_dims]; + CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { + params.dst_index_helper.OffsetToNdIndex(i, dst_index); +#pragma unroll + for (size_t dim = 0; dim < num_dims; ++dim) { + src_index[params.permutation[dim]] = dst_index[dim]; + } + IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); + dst[i] = src[src_offset]; + } +} + +// (B, X, Y) -> (B, Y, X) +// refer from https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ +template +__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType rows, + IndexType cols, IndexType num_tile_rows, + IndexType num_tile_cols, int32_t block_nums) { + const IndexType src_rows = rows; + const IndexType src_cols = cols; + const IndexType dst_rows = cols; + const IndexType dst_cols = rows; + + using T = typename std::aligned_storage::type; + __shared__ T tile[tile_size][tile_size + 1]; // To avoid bank conflict. + + const T* src = reinterpret_cast(src_ptr); + T* dst = reinterpret_cast(dst_ptr); + + IndexType batch_num_tile = num_tile_rows * num_tile_cols; + for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { + const IndexType batch_index = i / batch_num_tile; // the index of batch. + const IndexType tile_index = + i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the + // flatten index of tile in a batch. + + const IndexType tile_row_index = + tile_index / num_tile_cols; // the row index of tile in a batch. + const IndexType tile_col_index = + tile_index + - tile_row_index + * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. 
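// The index computations above decompose the flat grid-stride index i into
// (batch_index, tile_row_index, tile_col_index): divide by the number of tiles per
// batch, then split the remainder by num_tile_cols, using subtraction instead of a
// second % for each remainder. A tiny host-side check of that arithmetic; the names
// are local to this sketch.
#include <cassert>

static void DecomposeTileIndex(int i, int num_tile_rows, int num_tile_cols, int* batch,
                               int* tile_row, int* tile_col) {
  const int batch_num_tile = num_tile_rows * num_tile_cols;
  *batch = i / batch_num_tile;
  const int tile_index = i - *batch * batch_num_tile;  // i % batch_num_tile
  *tile_row = tile_index / num_tile_cols;
  *tile_col = tile_index - *tile_row * num_tile_cols;  // tile_index % num_tile_cols
}

int main() {
  int b = 0, r = 0, c = 0;
  DecomposeTileIndex(/*i=*/23, /*num_tile_rows=*/2, /*num_tile_cols=*/3, &b, &r, &c);
  assert(b == 3 && r == 1 && c == 2);  // 23 = 3 * (2*3) + 1 * 3 + 2
  return 0;
}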
+ + const IndexType offset = batch_index * src_rows * src_cols; + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; + if (col_in_matrix < src_cols && row_in_matrix < src_rows) { + tile[row_in_tile][col_in_tile] = src[offset + row_in_matrix * src_cols + col_in_matrix]; + } + } + } + __syncthreads(); + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; + if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { + dst[offset + row_in_matrix * dst_cols + col_in_matrix] = tile[col_in_tile][row_in_tile]; + } + } + } + __syncthreads(); + } +} + +/* +Here is a Movementsie=2 version of Batch Transpose. +When the H W can be divided by 2. we can read data use movementsize=4, and write back as +movementsize=4. +*/ +template +__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows, + IndexType cols, IndexType num_tile_rows, + IndexType num_tile_cols, int32_t block_nums) { + const IndexType src_rows = rows; + const IndexType src_cols = cols; + const IndexType dst_rows = cols; + const IndexType dst_cols = rows; + + static_assert(tile_size % 2 == 0, ""); + using T_MOV2 = typename std::aligned_storage<2, 2>::type; + using T_MOV4 = typename std::aligned_storage<4, 4>::type; + + const T_MOV4* src = reinterpret_cast(src_ptr); + T_MOV4* dst = reinterpret_cast(dst_ptr); + + // Use union structure to process Load and Store. + __shared__ union { + T_MOV2 tile_m2[tile_size][tile_size + 2]; // half [64][66] + T_MOV4 tile_m4[tile_size][tile_size / 2 + 1]; // half2 [64][33] + } tile_mem; + + IndexType batch_num_tile = num_tile_rows * num_tile_cols; + for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { + const IndexType batch_index = i / batch_num_tile; // the index of batch. + const IndexType tile_index = + i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the + // flatten index of tile in a batch. + + const IndexType tile_row_index = + tile_index / num_tile_cols; // the row index of tile in a batch. + const IndexType tile_col_index = + tile_index + - tile_row_index + * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. 
+ + const IndexType offset = batch_index * src_rows * src_cols; + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x * 2; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; + if (col_in_matrix < src_cols && row_in_matrix < src_rows) { + tile_mem.tile_m4[row_in_tile][col_in_tile] = + src[(offset + row_in_matrix * src_cols + col_in_matrix) / 2]; + } + } + } + __syncthreads(); + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x * 2; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; + union { + T_MOV4 m4; + T_MOV2 m2[2]; + } tmp_storage; + + if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { + tmp_storage.m2[0] = tile_mem.tile_m2[col_in_tile * 2][row_in_tile]; + tmp_storage.m2[1] = tile_mem.tile_m2[col_in_tile * 2 + 1][row_in_tile]; + dst[(offset + row_in_matrix * dst_cols + col_in_matrix) / 2] = tmp_storage.m4; + } + } + } + __syncthreads(); + } +} + +template +void LaunchBatchTransposeKernel(hipStream_t& cuda_stream, + const PermuteKernelParams& params, + const IndexType& num_batches, const IndexType& rows, + const IndexType& cols) { + IndexType num_tile_rows = (rows + tile_size - 1) / tile_size; + IndexType num_tile_cols = (cols + tile_size - 1) / tile_size; + const int32_t block_nums = num_batches * num_tile_rows * num_tile_cols; + int32_t launched_block_nums = std::min(block_nums, kCudaMaxBlocksNum); + if (tile_size == kMov2TileSize) { + const int32_t half2_thread = tile_size / 2; // cause each thread process two half elements. + BatchTransposeMovement2Kernel + <<>>( + params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, + block_nums); // Set threads num as 32x8 cause each threads + // process 4 elements to 64x66 half share memory. 
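// LaunchBatchTransposeKernel above sizes its grid as
//   num_tile_rows = ceil(rows / tile_size), num_tile_cols = ceil(cols / tile_size),
//   block_nums    = num_batches * num_tile_rows * num_tile_cols,
// and launches min(block_nums, kCudaMaxBlocksNum) blocks; the grid-stride loop in the
// kernels (i += gridDim.x) then covers any tiles beyond the launched blocks. A
// host-side sketch of just that arithmetic; kMaxBlocks is a stand-in constant, not
// the real kCudaMaxBlocksNum value.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t tile_size = 32, kMaxBlocks = 8192;
  const int64_t num_batches = 16, rows = 1000, cols = 1000;
  const int64_t num_tile_rows = (rows + tile_size - 1) / tile_size;        // 32
  const int64_t num_tile_cols = (cols + tile_size - 1) / tile_size;        // 32
  const int64_t block_nums = num_batches * num_tile_rows * num_tile_cols;  // 16384
  const int64_t launched = std::min(block_nums, kMaxBlocks);               // clamped to 8192
  std::printf("tiles=%lld launched=%lld tiles_per_block>=%lld\n", (long long)block_nums,
              (long long)launched, (long long)(block_nums / launched));
  return 0;
}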
+ } else { + BatchTransposeKernel + <<>>( + params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, block_nums); + } +} + +template +bool CheckIfGreaterEqualThanTileSize(const IndexType& rows, const IndexType& cols) { + if (rows < tile_size || cols < tile_size) { return false; } + return true; +} + +template +bool CheckLaunchBatchTranspose(const int* permutation, const IndexType& num_batches, + const IndexType& rows, const IndexType& cols) { + if (CheckIfGreaterEqualThanTileSize(rows, cols)) { + if (num_batches == 1 && permutation[1] == 0 && permutation[0] == 1) { + // 2d tensor case: (0, 1) -> (1, 0) + return true; + } else if (num_dims == 3 && permutation[2] == 1 && permutation[1] == 2) { + // 3d tensor case: (0, 1, 2) -> (0, 2, 1) + return true; + } else { + return false; + } + } + return false; +} + +template +bool CheckUseMov2(const IndexType& rows, const IndexType& cols, const void* src, void* dst) { + auto src_ptr = reinterpret_cast(src); + auto dst_ptr = reinterpret_cast(dst); + return (movement_size == 2) && (rows % 2 == 0) && (cols % 2 == 0) && (src_ptr % 4 == 0) + && (dst_ptr % 4 == 0); +} + +template +void InferBatchTransposeShape(const int64_t* src_dims, IndexType* num_batches, IndexType* rows, + IndexType* cols) { + if (num_dims == 2) { + *num_batches = 1; + *rows = src_dims[0]; + *cols = src_dims[1]; + } else { + *num_batches = src_dims[0]; + *rows = src_dims[1]; + *cols = src_dims[2]; + } +} + +template +void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, const int* permutation, + void* dst, size_t count) { + PermuteKernelParams params = + MakePermuteParams(src_dims, src, permutation, dst, count); + hipStream_t cuda_stream = stream->As()->cuda_stream(); + + if (num_dims == 2 || num_dims == 3) { + IndexType num_batches; + IndexType rows; + IndexType cols; + InferBatchTransposeShape(src_dims, &num_batches, &rows, &cols); + if (CheckLaunchBatchTranspose(params.permutation, num_batches, rows, + cols)) { + if (CheckUseMov2(rows, cols, src, dst)) { + LaunchBatchTransposeKernel(cuda_stream, params, + num_batches, rows, cols); + } else { + LaunchBatchTransposeKernel( + cuda_stream, params, num_batches, rows, cols); + } + } else { + PermuteKernel + <<>>(params); + } + } else { + PermuteKernel + <<>>(params); + } +} + +class PermuteImpl : public Permute { + public: + OF_DISALLOW_COPY_AND_MOVE(PermuteImpl); + PermuteImpl() = default; + ~PermuteImpl() override = default; + + using Permute::Launch; + void Launch(Stream* stream, DataType data_type, size_t num_dims, const int64_t* src_dims, + const void* src, const int* permutation, void* dst) override { + SimplifyThenLaunch(stream, data_type, num_dims, src_dims, src, permutation, dst); + } +}; + +class PermuteFactoryImpl : public PermuteFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(PermuteFactoryImpl); + PermuteFactoryImpl() = default; + ~PermuteFactoryImpl() override = default; + + std::unique_ptr New(size_t max_num_dims) override { + if (max_num_dims <= kMaxNumDims) { + return std::unique_ptr(new PermuteImpl()); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, PermuteFactory, PermuteFactoryImpl); + +} // namespace + +} // namespace internal + +} // namespace permute + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/softmax.hip.cpp b/oneflow/core/ep/rocm/primitive/softmax.hip.cpp index c1d2d00..85d065c 100644 --- a/oneflow/core/ep/rocm/primitive/softmax.hip.cpp +++ 
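// The batch-transpose fast path above only triggers for permutation (1, 0) on a 2-d
// tensor or (0, 2, 1) on a 3-d tensor, i.e. (B, rows, cols) -> (B, cols, rows), and
// only when both dimensions reach the tile size. A plain CPU reference of that
// permutation, handy as a correctness oracle for the tiled kernels; this sketch is
// not part of the primitive.
#include <cassert>
#include <cstdint>
#include <vector>

template<typename T>
std::vector<T> BatchTransposeCpu(const std::vector<T>& src, int64_t num_batches, int64_t rows,
                                 int64_t cols) {
  std::vector<T> dst(src.size());
  for (int64_t b = 0; b < num_batches; ++b) {
    for (int64_t r = 0; r < rows; ++r) {
      for (int64_t c = 0; c < cols; ++c) {
        dst[(b * cols + c) * rows + r] = src[(b * rows + r) * cols + c];
      }
    }
  }
  return dst;
}

int main() {
  // One batch, 2x3 -> 3x2.
  const std::vector<int> src = {1, 2, 3, 4, 5, 6};
  const std::vector<int> expect = {1, 4, 2, 5, 3, 6};
  assert(BatchTransposeCpu(src, 1, 2, 3) == expect);
  return 0;
}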
b/oneflow/core/ep/rocm/primitive/softmax.hip.cpp @@ -1,107 +1,107 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/softmax.h" -#include "oneflow/core/ep/include/primitive/log_softmax.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -enum class Algorithm { - kSoftmax, - kLogSoftmax, -}; - -template -void SoftmaxGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) { - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - oneflow::cuda::softmax::DirectLoad load(x, cols); - oneflow::cuda::softmax::DirectStore store(y, cols); - if (algorithm == Algorithm::kSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - cuda_stream, load, store, rows, cols))); - } else if (algorithm == Algorithm::kLogSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( - cuda_stream, load, store, rows, cols))); - } else { - UNIMPLEMENTED(); - } -} - -template -class SoftmaxImpl : public SoftmaxBase { - public: - OF_DISALLOW_COPY_AND_MOVE(SoftmaxImpl); - SoftmaxImpl() = default; - ~SoftmaxImpl() override = default; - - void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - SoftmaxGpu(cuda_stream, rows, cols, reinterpret_cast(x), - reinterpret_cast(y)); - } -}; - -template -std::unique_ptr NewSoftmax() { - return std::unique_ptr(new SoftmaxImpl()); -} - -template -class GenericSoftmaxFactoryImpl : public FactoryBase { - public: - OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxFactoryImpl); - GenericSoftmaxFactoryImpl() = default; - ~GenericSoftmaxFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ - {type_proto, NewSoftmax}, - - static const std::map()>> - new_softmax_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; - -#undef MAKE_NEW_SOFTMAX_ENTRY - - const auto it = new_softmax_handle.find(data_type); - if (it != new_softmax_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -using SoftmaxFactoryImpl = GenericSoftmaxFactoryImpl; -using LogSoftmaxFactoryImpl = - GenericSoftmaxFactoryImpl; -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxFactory, SoftmaxFactoryImpl); -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxFactory, LogSoftmaxFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/softmax.h" +#include "oneflow/core/ep/include/primitive/log_softmax.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +enum class Algorithm { + kSoftmax, + kLogSoftmax, +}; + +template +void SoftmaxGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) { + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + oneflow::cuda::softmax::DirectLoad load(x, cols); + oneflow::cuda::softmax::DirectStore store(y, cols); + if (algorithm == Algorithm::kSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + cuda_stream, load, store, rows, cols))); + } else if (algorithm == Algorithm::kLogSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( + cuda_stream, load, store, rows, cols))); + } else { + UNIMPLEMENTED(); + } +} + +template +class SoftmaxImpl : public SoftmaxBase { + public: + OF_DISALLOW_COPY_AND_MOVE(SoftmaxImpl); + SoftmaxImpl() = default; + ~SoftmaxImpl() override = default; + + void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + SoftmaxGpu(cuda_stream, rows, cols, reinterpret_cast(x), + reinterpret_cast(y)); + } +}; + +template +std::unique_ptr NewSoftmax() { + return std::unique_ptr(new SoftmaxImpl()); +} + +template +class GenericSoftmaxFactoryImpl : public FactoryBase { + public: + OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxFactoryImpl); + GenericSoftmaxFactoryImpl() = default; + ~GenericSoftmaxFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ + {type_proto, NewSoftmax}, + + static const std::map()>> + new_softmax_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; + +#undef MAKE_NEW_SOFTMAX_ENTRY + + const auto it = new_softmax_handle.find(data_type); + if (it != new_softmax_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +using SoftmaxFactoryImpl = GenericSoftmaxFactoryImpl; +using LogSoftmaxFactoryImpl = + GenericSoftmaxFactoryImpl; +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxFactory, SoftmaxFactoryImpl); +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxFactory, LogSoftmaxFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp b/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp index d184890..12088b8 100644 --- a/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp @@ -1,116 +1,116 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
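// The softmax primitive above treats its input as a (rows, cols) matrix, with every
// leading dimension flattened into rows and the reduction taken over the trailing
// cols; DispatchSoftmax then selects an implementation by column count. A numerically
// stable CPU reference for a single row (subtract the row max before exponentiating);
// this sketch is independent of the dispatch machinery.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

std::vector<double> SoftmaxRow(const std::vector<double>& x) {
  double max_x = x[0];
  for (double v : x) { max_x = std::max(max_x, v); }
  std::vector<double> y(x.size());
  double sum = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::exp(x[i] - max_x);  // shifting by the max keeps exp from overflowing
    sum += y[i];
  }
  for (double& v : y) { v /= sum; }
  return y;
}

int main() {
  const std::vector<double> y = SoftmaxRow({1.0, 2.0, 3.0});
  double total = 0.0;
  for (double v : y) { total += v; }
  assert(std::abs(total - 1.0) < 1e-12);  // a softmax row sums to 1
  assert(y[2] > y[1] && y[1] > y[0]);     // ordering is preserved
  return 0;
}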
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/softmax_backward.h" -#include "oneflow/core/ep/include/primitive/log_softmax_backward.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -enum class Algorithm { - kSoftmax, - kLogSoftmax, -}; - -template -void SoftmaxBackwardGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy, - T* dx) { - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load_y(y, cols); - cuda::softmax::DirectLoad load_dy(dy, cols); - cuda::softmax::DirectStore store(dx, cols); - if (algorithm == Algorithm::kSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - cuda_stream, load_y, load_dy, store, rows, cols))); - } else if (algorithm == Algorithm::kLogSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmaxGrad( - cuda_stream, load_y, load_dy, store, rows, cols))); - } else { - UNIMPLEMENTED(); - } -} - -template -class SoftmaxBackwardImpl : public SoftmaxBackwardBase { - public: - OF_DISALLOW_COPY_AND_MOVE(SoftmaxBackwardImpl); - SoftmaxBackwardImpl() = default; - ~SoftmaxBackwardImpl() override = default; - - void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy, - void* dx) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - SoftmaxBackwardGpu(cuda_stream, rows, cols, reinterpret_cast(y), - reinterpret_cast(dy), reinterpret_cast(dx)); - } -}; - -template -std::unique_ptr NewSoftmaxBackward() { - return std::unique_ptr( - new SoftmaxBackwardImpl()); -} - -template -class GenericSoftmaxBackwardFactoryImpl : public BackwardFactoryBase { - public: - OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxBackwardFactoryImpl); - GenericSoftmaxBackwardFactoryImpl() = default; - ~GenericSoftmaxBackwardFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ - {type_proto, NewSoftmaxBackward}, - - static const std::map()>> - new_softmax_backward_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; - -#undef MAKE_NEW_SOFTMAX_ENTRY - - const auto it = new_softmax_backward_handle.find(data_type); - if (it != new_softmax_backward_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -using SoftmaxBackwardFactoryImpl = - GenericSoftmaxBackwardFactoryImpl; -using LogSoftmaxBackwardFactoryImpl = - GenericSoftmaxBackwardFactoryImpl; -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxBackwardFactory, SoftmaxBackwardFactoryImpl); -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxBackwardFactory, - LogSoftmaxBackwardFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/softmax_backward.h" +#include "oneflow/core/ep/include/primitive/log_softmax_backward.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +enum class Algorithm { + kSoftmax, + kLogSoftmax, +}; + +template +void SoftmaxBackwardGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy, + T* dx) { + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + cuda::softmax::DirectStore store(dx, cols); + if (algorithm == Algorithm::kSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + cuda_stream, load_y, load_dy, store, rows, cols))); + } else if (algorithm == Algorithm::kLogSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmaxGrad( + cuda_stream, load_y, load_dy, store, rows, cols))); + } else { + UNIMPLEMENTED(); + } +} + +template +class SoftmaxBackwardImpl : public SoftmaxBackwardBase { + public: + OF_DISALLOW_COPY_AND_MOVE(SoftmaxBackwardImpl); + SoftmaxBackwardImpl() = default; + ~SoftmaxBackwardImpl() override = default; + + void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy, + void* dx) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + SoftmaxBackwardGpu(cuda_stream, rows, cols, reinterpret_cast(y), + reinterpret_cast(dy), reinterpret_cast(dx)); + } +}; + +template +std::unique_ptr NewSoftmaxBackward() { + return std::unique_ptr( + new SoftmaxBackwardImpl()); +} + +template +class GenericSoftmaxBackwardFactoryImpl : public BackwardFactoryBase { + public: + OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxBackwardFactoryImpl); + GenericSoftmaxBackwardFactoryImpl() = default; + ~GenericSoftmaxBackwardFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ + {type_proto, NewSoftmaxBackward}, + + static const std::map()>> + new_softmax_backward_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; + +#undef MAKE_NEW_SOFTMAX_ENTRY + + const auto it = new_softmax_backward_handle.find(data_type); + if (it != new_softmax_backward_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +using SoftmaxBackwardFactoryImpl = + GenericSoftmaxBackwardFactoryImpl; +using LogSoftmaxBackwardFactoryImpl = + GenericSoftmaxBackwardFactoryImpl; +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxBackwardFactory, SoftmaxBackwardFactoryImpl); +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxBackwardFactory, + LogSoftmaxBackwardFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git 
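// SoftmaxBackwardGpu above implements the standard gradients:
//   softmax:     dx_j = y_j * (dy_j - sum_k dy_k * y_k)
//   log-softmax: dx_j = dy_j - exp(y_j) * sum_k dy_k   (y being the log-softmax output)
// A CPU reference of both formulas, usable as an oracle for the fused kernels; this is
// a sketch in double precision over a single row, not OneFlow's implementation.
#include <cassert>
#include <cmath>
#include <vector>

std::vector<double> SoftmaxGradRow(const std::vector<double>& y, const std::vector<double>& dy) {
  double dot = 0.0;
  for (size_t i = 0; i < y.size(); ++i) { dot += dy[i] * y[i]; }
  std::vector<double> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i) { dx[i] = y[i] * (dy[i] - dot); }
  return dx;
}

std::vector<double> LogSoftmaxGradRow(const std::vector<double>& y,
                                      const std::vector<double>& dy) {
  double sum_dy = 0.0;
  for (double v : dy) { sum_dy += v; }
  std::vector<double> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i) { dx[i] = dy[i] - std::exp(y[i]) * sum_dy; }
  return dx;
}

int main() {
  // For both variants, the per-row gradient sums to zero when y is a valid (log-)softmax output.
  const std::vector<double> y = {0.2, 0.3, 0.5};
  const std::vector<double> dy = {1.0, -2.0, 0.5};
  double total = 0.0;
  for (double v : SoftmaxGradRow(y, dy)) { total += v; }
  assert(std::abs(total) < 1e-12);

  const std::vector<double> log_y = {std::log(0.2), std::log(0.3), std::log(0.5)};
  total = 0.0;
  for (double v : LogSoftmaxGradRow(log_y, dy)) { total += v; }
  assert(std::abs(total) < 1e-12);
  return 0;
}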
a/oneflow/core/ep/rocm/primitive/type_seq.h b/oneflow/core/ep/rocm/primitive/type_seq.h index d82aa05..fff3643 100644 --- a/oneflow/core/ep/rocm/primitive/type_seq.h +++ b/oneflow/core/ep/rocm/primitive/type_seq.h @@ -1,78 +1,78 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ -#define ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ - -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/common/data_type.h" - -#ifdef WITH_ROCM -#include -#include - -// #if CUDA_VERSION >= 11000 -// #include -// #endif // CUDA_VERSION >= 11000 - -#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool) -#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar) -#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) -#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) -#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) -#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) -#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) -#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) -#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) -#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble) -#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16) - -// #if CUDA_VERSION >= 11000 -// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16) -// #else -#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ -// #endif // CUDA_VERSION >= 11000 - -#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \ - CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ - CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ - CUDA_PRIMITIVE_INT8_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ - CUDA_PRIMITIVE_INT32_TYPE_SEQ \ - CUDA_PRIMITIVE_INT64_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -#define UTIL_OPS_DATA_TYPE_SEQ \ - CUDA_PRIMITIVE_INT8_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ - CUDA_PRIMITIVE_INT32_TYPE_SEQ \ - CUDA_PRIMITIVE_INT64_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -#endif // WITH_ROCM - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ +#define ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ + +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/common/data_type.h" + +#ifdef WITH_ROCM +#include +#include + +// #if CUDA_VERSION >= 11000 +// #include +// #endif // CUDA_VERSION >= 11000 + +#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool) +#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar) +#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) +#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) +#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) +#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) +#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) +#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) +#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) +#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble) +#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16) + +// #if CUDA_VERSION >= 11000 +// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16) +// #else +#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ +// #endif // CUDA_VERSION >= 11000 + +#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \ + CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ + CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ + CUDA_PRIMITIVE_INT8_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ + CUDA_PRIMITIVE_INT32_TYPE_SEQ \ + CUDA_PRIMITIVE_INT64_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +#define UTIL_OPS_DATA_TYPE_SEQ \ + CUDA_PRIMITIVE_INT8_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ + CUDA_PRIMITIVE_INT32_TYPE_SEQ \ + CUDA_PRIMITIVE_INT64_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +#endif // WITH_ROCM + #endif // ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/unary_functor.hip.h b/oneflow/core/ep/rocm/primitive/unary_functor.hip.h index f3ff395..2dcec8d 100644 --- a/oneflow/core/ep/rocm/primitive/unary_functor.hip.h +++ b/oneflow/core/ep/rocm/primitive/unary_functor.hip.h @@ -1,170 +1,170 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
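For reference, with the ROCm type sequences defined above (bfloat16 is compiled out because CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ is defined empty), the factory map in softmax_backward.hip.cpp expands to just three entries. The template arguments below are reconstructed from the CUDA version of that file and should be treated as approximate, not as the literal preprocessor output:

// Hand-expanded equivalent of the OF_PP_FOR_EACH_TUPLE map initializer under ROCm.
// Names and template-argument order follow the CUDA softmax_backward primitive and
// are a reconstruction, not part of this patch.
static const std::map<DataType, std::function<std::unique_ptr<SoftmaxBackwardBase>()>>
    new_softmax_backward_handle{
        {DataType::kFloat, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, float>},
        {DataType::kDouble, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, double>},
        {DataType::kFloat16, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, half>},
    };
// Any other DataType (including kBFloat16 here) misses the map and New() returns nullptr.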
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/common/primitive/unary_functor.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace ep { -namespace primitive { - -template -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src src) const { - return static_cast(0.5) * src - * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * src)); - } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(float src) const { return isinf(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(double src) const { return isinf(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(half src) const { return isnan(__half2float(src)); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(float src) const { return isnan(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(double src) const { return isnan(src); } -}; - -#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \ - template<> \ - struct UnaryFunctor { \ - UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ - \ - UnaryFunctor float_functor; \ - OF_DEVICE_FUNC half operator()(half src) const { \ - return __float2half(float_functor(__half2float(src))); \ - } \ - }; - -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCelu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kGelu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kMish); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus); - -// /*********nv_bfloat16_kernel*******/ - -// #if CUDA_VERSION >= 11000 - -// #define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \ -// template<> \ -// struct UnaryFunctor { \ -// UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ -// \ -// UnaryFunctor float_functor; \ -// 
OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \ -// return __float2bfloat16(float_functor(__bfloat162float(src))); \ -// } \ -// }; - -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kGelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSwish); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSigmoid); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardShrink); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardTanh); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLeakyRelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kMish); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSilu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftShrink); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold); - -// template<> -// struct UnaryFunctor { -// UnaryFunctor(Scalar attr0, Scalar attr1) {} - -// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isinf(__bfloat162float(src)); } -// }; - -// template<> -// struct UnaryFunctor { -// UnaryFunctor(Scalar attr0, Scalar attr1) {} - -// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); } -// }; - -// #endif - -} // namespace primitive -} // namespace ep -} // namespace oneflow - - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/common/primitive/unary_functor.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace ep { +namespace primitive { + +template +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC Dst operator()(Src src) const { + return static_cast(0.5) * src + * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * src)); + } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(float src) const { return isinf(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(double src) const { return isinf(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(half src) const { return isnan(__half2float(src)); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(float src) const { return isnan(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(double src) const { return isnan(src); } +}; + +#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \ + template<> \ + struct UnaryFunctor { \ + UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ + \ + UnaryFunctor float_functor; \ + OF_DEVICE_FUNC half operator()(half src) const { \ + return __float2half(float_functor(__half2float(src))); \ + } \ + }; + +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCelu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kGelu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kMish); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus); + +// /*********nv_bfloat16_kernel*******/ + +// #if CUDA_VERSION >= 11000 + +// #define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \ +// template<> \ +// struct UnaryFunctor { \ +// UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ +// \ +// UnaryFunctor float_functor; \ +// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \ +// return __float2bfloat16(float_functor(__bfloat162float(src))); \ +// } \ +// }; + +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCelu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kGelu); +// 
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSwish); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSigmoid); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardShrink); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardTanh); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLeakyRelu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kMish); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSelu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSilu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftShrink); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold); + +// template<> +// struct UnaryFunctor { +// UnaryFunctor(Scalar attr0, Scalar attr1) {} + +// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isinf(__bfloat162float(src)); } +// }; + +// template<> +// struct UnaryFunctor { +// UnaryFunctor(Scalar attr0, Scalar attr1) {} + +// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); } +// }; + +// #endif + +} // namespace primitive +} // namespace ep +} // namespace oneflow + + diff --git a/oneflow/core/framework/random_generator_impl.hip.cpp b/oneflow/core/framework/random_generator_impl.hip.cpp index d0285cc..7fdbc19 100644 --- a/oneflow/core/framework/random_generator_impl.hip.cpp +++ b/oneflow/core/framework/random_generator_impl.hip.cpp @@ -1,46 +1,46 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/random_generator_impl.h" - -namespace oneflow { -namespace one { - -namespace { - -__global__ void InitCurandStatesKernel(uint64_t seed, hiprandState* states, - CUDAGeneratorState* cuda_gen_state) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) - + (static_cast(id) >> 2U)); - hiprand_init(local_seed, 0, 0, &states[id]); - cuda_gen_state->dev_counter = static_cast(0); - cuda_gen_state->dev_offset = static_cast(0); -} - -} // namespace - -namespace detail { - -void InitCurandStates(uint64_t seed, int32_t block_num, int32_t thread_num, hiprandState* states, - CUDAGeneratorState* cuda_gen_state) { - hipLaunchKernelGGL(InitCurandStatesKernel, block_num, thread_num, 0, 0, seed, states, cuda_gen_state); -} - -} // namespace detail - -} // namespace one +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
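The SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR pattern above evaluates each half op by round-tripping through the corresponding float functor. For kGelu, for example, the generated specialization effectively computes the following (a sketch only; the real code is produced by the macro, and the float functor is the generic Gelu functor defined at the top of that header):

// Sketch of what the macro-generated half Gelu functor boils down to.
// HalfGeluSketch is an illustrative name, not part of the patch.
struct HalfGeluSketch {
  __device__ half operator()(half src) const {
    const float x = __half2float(src);                                    // widen to float
    const float y = 0.5f * x * (1.0f + erff(float(M_SQRT1_2) * x));       // float Gelu
    return __float2half(y);                                               // narrow back to half
  }
};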
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/random_generator_impl.h" + +namespace oneflow { +namespace one { + +namespace { + +__global__ void InitCurandStatesKernel(uint64_t seed, hiprandState* states, + CUDAGeneratorState* cuda_gen_state) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) + + (static_cast(id) >> 2U)); + hiprand_init(local_seed, 0, 0, &states[id]); + cuda_gen_state->dev_counter = static_cast(0); + cuda_gen_state->dev_offset = static_cast(0); +} + +} // namespace + +namespace detail { + +void InitCurandStates(uint64_t seed, int32_t block_num, int32_t thread_num, hiprandState* states, + CUDAGeneratorState* cuda_gen_state) { + hipLaunchKernelGGL(InitCurandStatesKernel, block_num, thread_num, 0, 0, seed, states, cuda_gen_state); +} + +} // namespace detail + +} // namespace one } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/hip/atomic.hip.h b/oneflow/core/hip/atomic.hip.h index cf1e001..c4daf4d 100644 --- a/oneflow/core/hip/atomic.hip.h +++ b/oneflow/core/hip/atomic.hip.h @@ -1,214 +1,214 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_HIP_ATOMIC_H_ -#define ONEFLOW_CORE_HIP_ATOMIC_H_ - -#if defined(__HIPCC__) - -#include -#include -#include - -namespace oneflow { - -namespace cuda { - -namespace atomic { - -namespace internal { - -template -__device__ __forceinline__ T CastCASImpl(T* address, T compare, T val) { - static_assert(sizeof(T) == sizeof(U), ""); - U ret = atomicCAS(reinterpret_cast(address), *(reinterpret_cast(&compare)), - *(reinterpret_cast(&val))); - return *(reinterpret_cast(&ret)); -} - -template -__device__ __forceinline__ typename std::enable_if::type -CASImpl(T* address, T compare, T val) { - return CastCASImpl(address, compare, val); -} - -template -__device__ __forceinline__ - typename std::enable_if::type - CASImpl(T* address, T compare, T val) { - return CastCASImpl(address, compare, val); -} - -__device__ __forceinline__ int CASImpl(int* address, int compare, int val) { - return atomicCAS(address, compare, val); -} - -__device__ __forceinline__ unsigned int CASImpl(unsigned int* address, unsigned int compare, - unsigned int val) { - return atomicCAS(address, compare, val); -} - -__device__ __forceinline__ unsigned long long int CASImpl(unsigned long long int* address, - unsigned long long int compare, - unsigned long long int val) { - return atomicCAS(address, compare, val); -} - -// #if __CUDA_ARCH__ >= 700 - -// __device__ __forceinline__ unsigned short int CASImpl(unsigned short int* address, -// unsigned short int compare, -// unsigned short int val) { -// return atomicCAS(address, compare, val); -// } - -// #endif // __CUDA_ARCH__ >= 700 - -template -struct AddOp { - __device__ __forceinline__ T operator()(T a, T b) { return a + b; } -}; - -template class BinaryOp> -__device__ __forceinline__ T AtomicCASBinaryImpl(T* address, T val) { - T old = *address; - T assumed; - do { - assumed = old; - old = CASImpl(address, assumed, BinaryOp()(old, val)); - } while (old != assumed); - return old; -} - -template -__device__ __forceinline__ T AddImpl(T* address, T val) { - return AtomicCASBinaryImpl(address, val); -} - -__device__ __forceinline__ int AddImpl(int* address, int val) { return atomicAdd(address, val); } - -__device__ __forceinline__ unsigned int AddImpl(unsigned int* address, unsigned int val) { - return atomicAdd(address, val); -} - -__device__ __forceinline__ unsigned long long int AddImpl(unsigned long long int* address, - unsigned long long int val) { - return atomicAdd(address, val); -} - -__device__ __forceinline__ uint64_t AddImpl(uint64_t* address, uint64_t val) { - static_assert(sizeof(uint64_t) == sizeof(unsigned long long int), ""); - return static_cast(atomicAdd(reinterpret_cast(address), - static_cast(val))); -} - -__device__ __forceinline__ float AddImpl(float* address, float val) { - return atomicAdd(address, val); -} - -// #if __CUDA_ARCH__ >= 600 - -__device__ __forceinline__ double AddImpl(double* address, double val) { - return atomicAdd(address, val); -} - -// __device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { -// return atomicAdd(address, val); -// } - -// #endif // __CUDA_ARCH__ >= 600 - -// #if __CUDA_ARCH__ >= 700 - -__device__ __forceinline__ half AddImpl(half* address, half val) { - float address_value = __half2float(*address); - return __float2half(atomicAdd(&address_value, __half2float(val))); } - -// #endif // __CUDA_ARCH__ >= 700 - -// #if __CUDA_ARCH__ >= 800 - -// __device__ __forceinline__ nv_bfloat16 AddImpl(nv_bfloat16* address, nv_bfloat16 val) { -// return atomicAdd(address, val); -// } - -// #endif // 
__CUDA_ARCH__ >= 800 - -// #if __CUDA_ARCH__ < 530 - -__device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { - half2 res; - float2 address_value = __half22float2(*address); - res.data.x = __float2half(atomicAdd(&address_value.x, __half2float(val.data.x))); - res.data.y = __float2half(atomicAdd(&address_value.y, __half2float(val.data.y))); - return res; -} - -// #endif // __CUDA_ARCH__ < 530 - -} // namespace internal - -template -__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { - return static_cast(v); -} - -template -__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { - return v; -} - -template -__device__ __forceinline__ T CAS(T* address, U compare, V val) { - return internal::CASImpl(address, Cast(compare), Cast(val)); -} - -template -__device__ __forceinline__ T Add(T* address, U val) { - return internal::AddImpl(address, Cast(val)); -} - -__device__ __forceinline__ float Max(float* address, const float val) { - int* address_as_i = (int*)address; - int old = *address_as_i; - int assumed = 0; - do { - assumed = old; - old = atomicCAS(address_as_i, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); - } while (assumed != old); - return __int_as_float(old); -} - -__device__ __forceinline__ double Max(double* address, const double val) { - unsigned long long int* address_as_i = (unsigned long long int*)address; - unsigned long long int old = *address_as_i; - unsigned long long int assumed = 0; - do { - assumed = old; - old = atomicCAS(address_as_i, assumed, - __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); - } while (assumed != old); - return __longlong_as_double(old); -} - -} // namespace atomic - -} // namespace cuda - -} // namespace oneflow - -#endif // defined(__HIPCC__) - -#endif // ONEFLOW_CORE_CUDA_ATOMIC_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_HIP_ATOMIC_H_ +#define ONEFLOW_CORE_HIP_ATOMIC_H_ + +#if defined(__HIPCC__) + +#include +#include +#include + +namespace oneflow { + +namespace cuda { + +namespace atomic { + +namespace internal { + +template +__device__ __forceinline__ T CastCASImpl(T* address, T compare, T val) { + static_assert(sizeof(T) == sizeof(U), ""); + U ret = atomicCAS(reinterpret_cast(address), *(reinterpret_cast(&compare)), + *(reinterpret_cast(&val))); + return *(reinterpret_cast(&ret)); +} + +template +__device__ __forceinline__ typename std::enable_if::type +CASImpl(T* address, T compare, T val) { + return CastCASImpl(address, compare, val); +} + +template +__device__ __forceinline__ + typename std::enable_if::type + CASImpl(T* address, T compare, T val) { + return CastCASImpl(address, compare, val); +} + +__device__ __forceinline__ int CASImpl(int* address, int compare, int val) { + return atomicCAS(address, compare, val); +} + +__device__ __forceinline__ unsigned int CASImpl(unsigned int* address, unsigned int compare, + unsigned int val) { + return atomicCAS(address, compare, val); +} + +__device__ __forceinline__ unsigned long long int CASImpl(unsigned long long int* address, + unsigned long long int compare, + unsigned long long int val) { + return atomicCAS(address, compare, val); +} + +// #if __CUDA_ARCH__ >= 700 + +// __device__ __forceinline__ unsigned short int CASImpl(unsigned short int* address, +// unsigned short int compare, +// unsigned short int val) { +// return atomicCAS(address, compare, val); +// } + +// #endif // __CUDA_ARCH__ >= 700 + +template +struct AddOp { + __device__ __forceinline__ T operator()(T a, T b) { return a + b; } +}; + +template class BinaryOp> +__device__ __forceinline__ T AtomicCASBinaryImpl(T* address, T val) { + T old = *address; + T assumed; + do { + assumed = old; + old = CASImpl(address, assumed, BinaryOp()(old, val)); + } while (old != assumed); + return old; +} + +template +__device__ __forceinline__ T AddImpl(T* address, T val) { + return AtomicCASBinaryImpl(address, val); +} + +__device__ __forceinline__ int AddImpl(int* address, int val) { return atomicAdd(address, val); } + +__device__ __forceinline__ unsigned int AddImpl(unsigned int* address, unsigned int val) { + return atomicAdd(address, val); +} + +__device__ __forceinline__ unsigned long long int AddImpl(unsigned long long int* address, + unsigned long long int val) { + return atomicAdd(address, val); +} + +__device__ __forceinline__ uint64_t AddImpl(uint64_t* address, uint64_t val) { + static_assert(sizeof(uint64_t) == sizeof(unsigned long long int), ""); + return static_cast(atomicAdd(reinterpret_cast(address), + static_cast(val))); +} + +__device__ __forceinline__ float AddImpl(float* address, float val) { + return atomicAdd(address, val); +} + +// #if __CUDA_ARCH__ >= 600 + +__device__ __forceinline__ double AddImpl(double* address, double val) { + return atomicAdd(address, val); +} + +// __device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { +// return atomicAdd(address, val); +// } + +// #endif // __CUDA_ARCH__ >= 600 + +// #if __CUDA_ARCH__ >= 700 + +__device__ __forceinline__ half AddImpl(half* address, half val) { + float address_value = __half2float(*address); + return __float2half(atomicAdd(&address_value, __half2float(val))); } + +// #endif // __CUDA_ARCH__ >= 700 + +// #if __CUDA_ARCH__ >= 800 + +// __device__ __forceinline__ nv_bfloat16 AddImpl(nv_bfloat16* address, nv_bfloat16 val) { +// return atomicAdd(address, val); +// } + +// #endif // 
__CUDA_ARCH__ >= 800 + +// #if __CUDA_ARCH__ < 530 + +__device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { + half2 res; + float2 address_value = __half22float2(*address); + res.data.x = __float2half(atomicAdd(&address_value.x, __half2float(val.data.x))); + res.data.y = __float2half(atomicAdd(&address_value.y, __half2float(val.data.y))); + return res; +} + +// #endif // __CUDA_ARCH__ < 530 + +} // namespace internal + +template +__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { + return static_cast(v); +} + +template +__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { + return v; +} + +template +__device__ __forceinline__ T CAS(T* address, U compare, V val) { + return internal::CASImpl(address, Cast(compare), Cast(val)); +} + +template +__device__ __forceinline__ T Add(T* address, U val) { + return internal::AddImpl(address, Cast(val)); +} + +__device__ __forceinline__ float Max(float* address, const float val) { + int* address_as_i = (int*)address; + int old = *address_as_i; + int assumed = 0; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); + } while (assumed != old); + return __int_as_float(old); +} + +__device__ __forceinline__ double Max(double* address, const double val) { + unsigned long long int* address_as_i = (unsigned long long int*)address; + unsigned long long int old = *address_as_i; + unsigned long long int assumed = 0; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, + __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); + } while (assumed != old); + return __longlong_as_double(old); +} + +} // namespace atomic + +} // namespace cuda + +} // namespace oneflow + +#endif // defined(__HIPCC__) + +#endif // ONEFLOW_CORE_CUDA_ATOMIC_H_ diff --git a/oneflow/core/hip/elementwise.hip.h b/oneflow/core/hip/elementwise.hip.h index 29b51fd..8c5ae25 100644 --- a/oneflow/core/hip/elementwise.hip.h +++ b/oneflow/core/hip/elementwise.hip.h @@ -1,243 +1,243 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
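Note that the half overload of AddImpl above performs the atomicAdd on a local float copy of the loaded value, so the sum is never written back through `address`. If an atomic read-modify-write is wanted, one option is to CAS the containing 32-bit word, along the lines of the generic AtomicCASBinaryImpl in the same header. The sketch below is illustrative, not part of the patch, and assumes the __half_as_ushort / __ushort_as_half bit-cast intrinsics from the fp16 header are available:

// Illustrative helper, not part of the patch.
__device__ __forceinline__ half AtomicAddHalfViaCas(half* address, half val) {
  // Work on the aligned 32-bit word that contains the 16-bit target.
  unsigned int* base = reinterpret_cast<unsigned int*>(
      reinterpret_cast<size_t>(address) & ~static_cast<size_t>(3));
  const unsigned int shift = (reinterpret_cast<size_t>(address) & 2) ? 16u : 0u;
  unsigned int old = *base;
  unsigned int assumed = 0;
  do {
    assumed = old;
    const half old_half =
        __ushort_as_half(static_cast<unsigned short>((assumed >> shift) & 0xffffu));
    const half new_half = __float2half(__half2float(old_half) + __half2float(val));
    const unsigned int new_bits = static_cast<unsigned int>(__half_as_ushort(new_half));
    const unsigned int updated = (assumed & ~(0xffffu << shift)) | (new_bits << shift);
    old = atomicCAS(base, assumed, updated);
  } while (old != assumed);
  // Match AddImpl's convention of returning the value seen at *address before the add.
  return __ushort_as_half(static_cast<unsigned short>((old >> shift) & 0xffffu));
}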
-*/ -#ifndef ONEFLOW_CORE_HIP_ELEMENTWISE_H_ -#define ONEFLOW_CORE_HIP_ELEMENTWISE_H_ - -#ifdef WITH_ROCM - -#include -#include -#include -#include - -namespace oneflow { - -namespace cuda { - -namespace elementwise { - -constexpr int kBlockSize = 256; -constexpr int kNumWaves = 32; - -inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int tpm; - { - hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); - if (err != hipSuccess) { return err; } - } - *num_blocks = std::max(1, std::min((n + kBlockSize - 1) / kBlockSize, - sm_count * tpm / kBlockSize * kNumWaves)); - return hipSuccess; -} - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == sizeof(T) * pack_size, ""); - __device__ Pack() { - // do nothing - } - PackType storage; - T elem[pack_size]; -}; - -template -struct alignas(sizeof(T) * pack_size) Packed { - __device__ Packed() { - // do nothing - } - union { - T elem[pack_size]; - }; -}; - -constexpr int kMaxPackBytes = 128 / 8; -constexpr int kMaxPackSize = 8; - -constexpr int Min(int a, int b) { return a < b ? a : b; } - -template -constexpr int PackSize() { - return Min(kMaxPackBytes / sizeof(T), kMaxPackSize); -} - -template -constexpr int PackSize() { - return Min(PackSize(), PackSize()); -} - -template -class HasApply2 { - typedef char one; - struct two { - char x[2]; - }; - - template - static one test(decltype(&C::Apply2)); - template - static two test(...); - - public: - enum { value = sizeof(test(0)) == sizeof(char) }; -}; - -template -__device__ typename std::enable_if::value == true && pack_size % 2 == 0, - Packed>::type -ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { - Packed ret; -#pragma unroll - for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in + j)...); } - return ret; -} - -template -__device__ typename std::enable_if::value == false || pack_size % 2 != 0, - Packed>::type -ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { - Packed ret; -#pragma unroll - for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in[j])...); } - return ret; -} - -template -__global__ void __launch_bounds__(kBlockSize) - ApplyGeneric(FactoryT factory, int64_t n_pack, Packed* pack_r, - const Packed*... pack_in, int64_t n_tail, R* tail_r, - const IN*... tail_in) { - auto functor = factory(); - const int global_tid = blockIdx.x * kBlockSize + threadIdx.x; - for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) { - pack_r[i] = ApplyPack(functor, (pack_in[i].elem)...); - } - if (tail && global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); } -} - -template -struct SimpleFactory { - explicit SimpleFactory(FunctorT functor) : tpl(functor) {} - __device__ FunctorT operator()() const { return tpl; } - - private: - FunctorT tpl; -}; - -template -bool IsAligendForPack() { - return true; -} - -template -bool IsAligendForPack(const T* ptr, const Args*... others) { - return reinterpret_cast(ptr) % sizeof(Pack) == 0 - && IsAligendForPack(others...); -} - -template -hipError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... 
in, hipStream_t stream) { - const int64_t n_pack = n / pack_size; - const int64_t tail_offset = n_pack * pack_size; - const int64_t n_tail = n - tail_offset; - int num_blocks; - { - hipError_t err = GetNumBlocks(n_pack, &num_blocks); - if (err != hipSuccess) { return err; } - } - auto func = n_tail > 0 ? ApplyGeneric - : ApplyGeneric; - hipLaunchKernelGGL(func, num_blocks, kBlockSize, 0, stream, - factory, n_pack, reinterpret_cast*>(r), - (reinterpret_cast*>(in))..., n_tail, r + tail_offset, - (in + tail_offset)...); - return hipPeekAtLastError(); -} - -template -struct GenericLauncher { - static hipError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in, - hipStream_t stream) { - constexpr int max_pack_size = PackSize(); - if (IsAligendForPack(r, in...)) { - return LaunchKernel(factory, n, r, in..., stream); - } else { - return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in..., stream); - } - } -}; - -template -inline hipError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, - hipStream_t stream) { - return GenericLauncher::Launch(factory, n, r, a, stream); -} - -template -inline hipError_t Unary(FunctorT functor, int64_t n, R* r, const A* a, hipStream_t stream) { - return UnaryWithFactory(SimpleFactory(functor), n, r, a, stream); -} - -template -inline hipError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, - hipStream_t stream) { - return GenericLauncher::Launch(factory, n, r, a, b, stream); -} - -template -inline hipError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, - hipStream_t stream) { - return BinaryWithFactory(SimpleFactory(functor), n, r, a, b, stream); -} - -template -inline hipError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, - const C* c, hipStream_t stream) { - return GenericLauncher::Launch(factory, n, r, a, b, c, stream); -} - -template -inline hipError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c, - hipStream_t stream) { - return TernaryWithFactory(SimpleFactory(functor), n, r, a, b, c, stream); -} - -} // namespace elementwise - -} // namespace cuda - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_ELEMENTWISE_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
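A quick sanity check of the pack-width selection in this header: with kMaxPackBytes = 128 / 8 = 16 and kMaxPackSize = 8, PackSize selects the widest access that still fits in 128 bits, and for multiple operand types the smallest per-type width wins. The asserts below are illustrative and would hold in any translation unit that includes this header together with the HIP fp16 header:

// Values follow directly from kMaxPackBytes = 16 and kMaxPackSize = 8.
using oneflow::cuda::elementwise::PackSize;
static_assert(PackSize<half>() == 8, "16 B / sizeof(half)");
static_assert(PackSize<float>() == 4, "16 B / sizeof(float)");
static_assert(PackSize<double>() == 2, "16 B / sizeof(double)");
static_assert(PackSize<half, float>() == 4, "min of per-type pack sizes");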
+*/ +#ifndef ONEFLOW_CORE_HIP_ELEMENTWISE_H_ +#define ONEFLOW_CORE_HIP_ELEMENTWISE_H_ + +#ifdef WITH_ROCM + +#include +#include +#include +#include + +namespace oneflow { + +namespace cuda { + +namespace elementwise { + +constexpr int kBlockSize = 256; +constexpr int kNumWaves = 32; + +inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int tpm; + { + hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); + if (err != hipSuccess) { return err; } + } + *num_blocks = std::max(1, std::min((n + kBlockSize - 1) / kBlockSize, + sm_count * tpm / kBlockSize * kNumWaves)); + return hipSuccess; +} + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * pack_size, ""); + __device__ Pack() { + // do nothing + } + PackType storage; + T elem[pack_size]; +}; + +template +struct alignas(sizeof(T) * pack_size) Packed { + __device__ Packed() { + // do nothing + } + union { + T elem[pack_size]; + }; +}; + +constexpr int kMaxPackBytes = 128 / 8; +constexpr int kMaxPackSize = 8; + +constexpr int Min(int a, int b) { return a < b ? a : b; } + +template +constexpr int PackSize() { + return Min(kMaxPackBytes / sizeof(T), kMaxPackSize); +} + +template +constexpr int PackSize() { + return Min(PackSize(), PackSize()); +} + +template +class HasApply2 { + typedef char one; + struct two { + char x[2]; + }; + + template + static one test(decltype(&C::Apply2)); + template + static two test(...); + + public: + enum { value = sizeof(test(0)) == sizeof(char) }; +}; + +template +__device__ typename std::enable_if::value == true && pack_size % 2 == 0, + Packed>::type +ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { + Packed ret; +#pragma unroll + for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in + j)...); } + return ret; +} + +template +__device__ typename std::enable_if::value == false || pack_size % 2 != 0, + Packed>::type +ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { + Packed ret; +#pragma unroll + for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in[j])...); } + return ret; +} + +template +__global__ void __launch_bounds__(kBlockSize) + ApplyGeneric(FactoryT factory, int64_t n_pack, Packed* pack_r, + const Packed*... pack_in, int64_t n_tail, R* tail_r, + const IN*... tail_in) { + auto functor = factory(); + const int global_tid = blockIdx.x * kBlockSize + threadIdx.x; + for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) { + pack_r[i] = ApplyPack(functor, (pack_in[i].elem)...); + } + if (tail && global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); } +} + +template +struct SimpleFactory { + explicit SimpleFactory(FunctorT functor) : tpl(functor) {} + __device__ FunctorT operator()() const { return tpl; } + + private: + FunctorT tpl; +}; + +template +bool IsAligendForPack() { + return true; +} + +template +bool IsAligendForPack(const T* ptr, const Args*... others) { + return reinterpret_cast(ptr) % sizeof(Pack) == 0 + && IsAligendForPack(others...); +} + +template +hipError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... 
in, hipStream_t stream) { + const int64_t n_pack = n / pack_size; + const int64_t tail_offset = n_pack * pack_size; + const int64_t n_tail = n - tail_offset; + int num_blocks; + { + hipError_t err = GetNumBlocks(n_pack, &num_blocks); + if (err != hipSuccess) { return err; } + } + auto func = n_tail > 0 ? ApplyGeneric + : ApplyGeneric; + hipLaunchKernelGGL(func, num_blocks, kBlockSize, 0, stream, + factory, n_pack, reinterpret_cast*>(r), + (reinterpret_cast*>(in))..., n_tail, r + tail_offset, + (in + tail_offset)...); + return hipPeekAtLastError(); +} + +template +struct GenericLauncher { + static hipError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in, + hipStream_t stream) { + constexpr int max_pack_size = PackSize(); + if (IsAligendForPack(r, in...)) { + return LaunchKernel(factory, n, r, in..., stream); + } else { + return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in..., stream); + } + } +}; + +template +inline hipError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, + hipStream_t stream) { + return GenericLauncher::Launch(factory, n, r, a, stream); +} + +template +inline hipError_t Unary(FunctorT functor, int64_t n, R* r, const A* a, hipStream_t stream) { + return UnaryWithFactory(SimpleFactory(functor), n, r, a, stream); +} + +template +inline hipError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, + hipStream_t stream) { + return GenericLauncher::Launch(factory, n, r, a, b, stream); +} + +template +inline hipError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, + hipStream_t stream) { + return BinaryWithFactory(SimpleFactory(functor), n, r, a, b, stream); +} + +template +inline hipError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, + const C* c, hipStream_t stream) { + return GenericLauncher::Launch(factory, n, r, a, b, c, stream); +} + +template +inline hipError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c, + hipStream_t stream) { + return TernaryWithFactory(SimpleFactory(functor), n, r, a, b, c, stream); +} + +} // namespace elementwise + +} // namespace cuda + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_ELEMENTWISE_H_ diff --git a/oneflow/core/hip/layer_norm.hip.h b/oneflow/core/hip/layer_norm.hip.h index 97891c3..183197f 100644 --- a/oneflow/core/hip/layer_norm.hip.h +++ b/oneflow/core/hip/layer_norm.hip.h @@ -1,1606 +1,1606 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
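The Unary/Binary/Ternary entry points are the public surface of this header. A minimal usage sketch (ScaleFunctor and ScaleAsync are illustrative names, not part of the patch):

// Illustrative only: scale n floats from x into y on the given HIP stream.
struct ScaleFunctor {
  explicit ScaleFunctor(float alpha) : alpha(alpha) {}
  __device__ float operator()(float x) const { return alpha * x; }
  float alpha;
};

hipError_t ScaleAsync(float* y, const float* x, int64_t n, float alpha, hipStream_t stream) {
  // Unary() picks a pack width from element type and pointer alignment,
  // then launches ApplyGeneric on `stream`; the call itself is asynchronous.
  return oneflow::cuda::elementwise::Unary(ScaleFunctor(alpha), n, y, x, stream);
}

GenericLauncher falls back to pack size 1 when any of the pointers is not aligned for the packed type, so the same call also works for arbitrarily offset buffers.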
-*/ - -#ifndef ONEFLOW_CORE_HIP_LAYER_NORM_H_ -#define ONEFLOW_CORE_HIP_LAYER_NORM_H_ - -#ifdef WITH_ROCM - -#include "hip/hip_runtime.h" -#include -// #include -#include - -namespace oneflow { - -namespace cuda { - -namespace layer_norm { - -constexpr int kWarpSize = 64; - -template -struct SumOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } -}; - -template -struct MaxOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } -}; - -template class ReductionOp, typename T, int thread_group_width = kWarpSize> -__inline__ __device__ T WarpAllReduce(T val) { - for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { - // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask, thread_group_width)); - val = ReductionOp()(val, __shfl_xor(val, mask, thread_group_width)); - } - return val; -} - -template class ReductionOp, typename T, int block_size> -__inline__ __device__ T BlockAllReduce(T val) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T result_broadcast; - T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); - if (threadIdx.x == 0) { result_broadcast = result; } - __syncthreads(); - return result_broadcast; -} - -template -__inline__ __device__ T Div(T a, T b); - -template<> -__inline__ __device__ float Div(float a, float b) { -#ifdef OF_LAYER_NORM_USE_FAST_MATH - return __fdividef(a, b); -#else - return a / b; -#endif -} - -template<> -__inline__ __device__ double Div(double a, double b) { - return a / b; -} - -template -__inline__ __device__ T Rsqrt(T x); - -template<> -__inline__ __device__ float Rsqrt(float x) { -#ifdef OF_LAYER_NORM_USE_FAST_MATH - return __frsqrt_rn(x); -#else - return rsqrt(x); -#endif -} - -template<> -__inline__ __device__ double Rsqrt(double x) { - return rsqrt(x); -} - -template -inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, - int64_t max_blocks, int64_t waves, int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int max_active_blocks; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, - block_size, dynamic_smem_size); - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); - return hipSuccess; -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -// #if CUDA_VERSION >= 11000 -// template<> -// struct DefaultComputeType { -// using type = float; -// }; -// #endif // CUDA_VERSION >= 11000 - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == sizeof(T) * N, ""); - __device__ Pack() { - // do nothing - } - PackType storage; - T elem[N]; -}; - -template -struct DirectLoad { - DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { dst[i] = 
static_cast(pack.elem[i]); } - } - const SRC* src; - int64_t row_size; -}; - -template -struct DirectStore { - DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - Pack pack; - const int64_t offset = (row * row_size + col) / N; -#pragma unroll - for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - int64_t row_size; -}; - -template -inline __device__ void WelfordCombine(T val, T* mean, T* m2, T* count) { - // Use Welford Online algorithem to compute mean and variance - // For more details you can refer to: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - *count += 1; - T delta1 = val - *mean; - *mean += Div(delta1, *count); - T delta2 = val - *mean; - *m2 += delta1 * delta2; -} - -template -inline __device__ void WelfordCombine(T b_mean, T b_m2, T b_count, T* mean, T* m2, T* count) { - if (b_count == 0) { return; } - T new_count = *count + b_count; - T nb_over_n = Div(b_count, new_count); - T delta = b_mean - *mean; - *mean += delta * nb_over_n; - *m2 += b_m2 + delta * delta * (*count) * nb_over_n; - *count = new_count; -} - -template -__inline__ __device__ void WelfordWarpReduce(T thread_mean, T thread_m2, T thread_count, T* mean, - T* m2, T* count) { - *mean = thread_mean; - *m2 = thread_m2; - *count = thread_count; - for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { - // T b_mean = __shfl_down(0xffffffff, *mean, mask, thread_group_width); - // T b_m2 = __shfl_down(0xffffffff, *m2, mask, thread_group_width); - // T b_count = __shfl_down(0xffffffff, *count, mask, thread_group_width); - T b_mean = __shfl_down(*mean, mask, thread_group_width); - T b_m2 = __shfl_down(*m2, mask, thread_group_width); - T b_count = __shfl_down(*count, mask, thread_group_width); - WelfordCombine(b_mean, b_m2, b_count, mean, m2, count); - } -} - -template -__inline__ __device__ void WelfordWarpAllReduce(T thread_mean, T thread_m2, T thread_count, T* mean, - T* m2, T* count) { - WelfordWarpReduce(thread_mean, thread_m2, thread_count, mean, m2, count); - // *mean = __shfl(0xffffffff, *mean, 0, thread_group_width); - // *m2 = __shfl(0xffffffff, *m2, 0, thread_group_width); - // *count = __shfl(0xffffffff, *count, 0, thread_group_width); - *mean = __shfl(*mean, 0, thread_group_width); - *m2 = __shfl(*m2, 0, thread_group_width); - *count = __shfl(*count, 0, thread_group_width); -} - -template -__inline__ __device__ void WelfordBlockAllReduce(T thread_mean, T thread_m2, T thread_count, - T* result_mean, T* result_m2, T* result_count) { - __shared__ T mean_shared[kWarpSize]; - __shared__ T m2_shared[kWarpSize]; - __shared__ T count_shared[kWarpSize]; - __shared__ T mean_result_broadcast; - __shared__ T m2_result_broadcast; - __shared__ T count_result_broadcast; - const int lid = threadIdx.x % kWarpSize; - const int wid = threadIdx.x / kWarpSize; - T warp_mean = 0; - T warp_m2 = 0; - T warp_count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count); - __syncthreads(); - if (lid == 0) { - mean_shared[wid] = warp_mean; - m2_shared[wid] = warp_m2; - count_shared[wid] = warp_count; - } - __syncthreads(); - if (wid == 0) { - if (threadIdx.x < blockDim.x / kWarpSize) { - warp_mean = mean_shared[lid]; - warp_m2 = m2_shared[lid]; - warp_count = count_shared[lid]; - } else { - warp_mean = static_cast(0); - warp_m2 = static_cast(0); - 
warp_count = static_cast(0); - } - __syncthreads(); - T block_mean = 0; - T block_m2 = 0; - T block_count = 0; - WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count); - if (lid == 0) { - mean_result_broadcast = block_mean; - m2_result_broadcast = block_m2; - count_result_broadcast = block_count; - } - } - __syncthreads(); - *result_mean = mean_result_broadcast; - *result_m2 = m2_result_broadcast; - *result_count = count_result_broadcast; -} - -template -__global__ void LayerNormWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - static_assert(cols_per_thread % pack_size == 0, ""); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - constexpr int num_packs = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - ComputeType buf[rows_per_access][cols_per_thread]; - const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int64_t num_global_thread_group = gridDim.x * blockDim.y; - const int64_t lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_mean[rows_per_access]; - ComputeType thread_m2[rows_per_access]; - ComputeType thread_count[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_mean[row_id] = 0; - thread_m2[row_id] = 0; - thread_count[row_id] = 0; - ComputeType* row_buf = buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; ++pack_id) { - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - const int pack_offset = pack_id * pack_size; - if (!padding || col < cols) { - load.template load(row_buf + pack_offset, row + row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - WelfordCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, - thread_count + row_id); - } - } else { -#pragma unroll - for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = 0; } - } - } - } - ComputeType warp_mean[rows_per_access]; - ComputeType warp_m2[rows_per_access]; - ComputeType warp_count[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - int global_row_id = row + row_id; - ComputeType* row_buf = buf[row_id]; - WelfordWarpAllReduce( - thread_mean[row_id], thread_m2[row_id], thread_count[row_id], warp_mean + row_id, - warp_m2 + row_id, warp_count + row_id); - ComputeType row_mean = warp_mean[row_id]; - ComputeType row_variance = - max(Div(warp_m2[row_id], warp_count[row_id]), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (lane_id == 0) { - mean[global_row_id] = row_mean; - inv_variance[global_row_id] = row_inv_var; - } -#pragma unroll - for (int i = 0; i < cols_per_thread; ++i) { - row_buf[i] = (row_buf[i] - row_mean) * row_inv_var; - } -#pragma unroll - for (int i = 0; i < num_packs; ++i) { - const int col = (i * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - store.template store(row_buf + i * pack_size, global_row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) 
{ - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormWarpImpl, - block_size, 0, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormWarpImpl - <<>>(load, store, rows, cols, epsilon, mean, inv_variance); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchLayerNormWarpImplPadding(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchLayerNormWarpImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else { - return LaunchLayerNormWarpImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } -} - -template -typename std::enable_if::type DispatchLayerNormWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } else { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ - inv_variance); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchLayerNormWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } else { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } \ - } - 
DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ - inv_variance); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} -template -typename std::enable_if::type DispatchLayerNormWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } else { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ - inv_variance); \ - } - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchLayerNormWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols % 4 == 0) { - return DispatchLayerNormWarpImplCols( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else if (cols % 2 == 0) { - return DispatchLayerNormWarpImplCols( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else { - return DispatchLayerNormWarpImplCols( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } - } -}; - -template -inline hipError_t DispatchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - return DispatchLayerNormWarpImplPackSize()( - stream, load, store, rows, cols, epsilon, mean, inv_variance); -} - -template -__global__ void LayerNormBlockSMemImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_mean = 0; - ComputeType thread_m2 = 0; - ComputeType thread_count = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += 
block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); - } - } - ComputeType row_mean = 0; - ComputeType row_m2 = 0; - ComputeType row_count = 0; - WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, - &row_count); - ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - mean[row] = row_mean; - inv_variance[row] = row_inv_var; - } - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void LayerNormBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_mean = 0; - ComputeType thread_m2 = 0; - ComputeType thread_count = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); - } - } - ComputeType row_mean = 0; - ComputeType row_m2 = 0; - ComputeType row_count = 0; - WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, - &row_count); - ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - mean[row] = row_mean; - inv_variance[row] = row_inv_var; - } - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - - -template -inline hipError_t LaunchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, - int smem, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormBlockSMemImpl, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormBlockSMemImpl - <<>>(load, store, rows, cols, epsilon, mean, - inv_variance); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchLayerNormBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, - int smem, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - 
GetNumBlocks(LayerNormBlockSMemImpl_1024, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormBlockSMemImpl_1024 - <<>>(load, store, rows, cols, epsilon, mean, - inv_variance); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchLayerNormBlockSMemImplBlockSize( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance, bool* success) { - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType); - int max_active_blocks_conf_1; - - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - LayerNormBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - LayerNormBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormBlockSMemImpl_1024( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - LayerNormBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - - if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormBlockSMemImpl( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - LayerNormBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormBlockSMemImpl( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); - } - *success = true; - return LaunchLayerNormBlockSMemImpl( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); -} - -template -struct TryDispatchLayerNormBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance, bool* success) { - if (cols % 4 == 0) { - return TryDispatchLayerNormBlockSMemImplBlockSize( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); - } else if (cols % 2 == 0) { - return TryDispatchLayerNormBlockSMemImplBlockSize( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); - } else { - return TryDispatchLayerNormBlockSMemImplBlockSize( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); - } - } -}; - -template -inline hipError_t TryDispatchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance, bool* success) { - return TryDispatchLayerNormBlockSMemImplPackSize()( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); -} - 
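
TryDispatchLayerNormBlockSMemImplBlockSize (removed above, and re-added in HIP form later in this patch) picks the block size for the shared-memory kernel purely from reported occupancy: if even a 128-thread block cannot stay resident with one row of ComputeType in dynamic shared memory, it reports failure so the caller falls back to the uncached kernel; otherwise it takes the largest of 1024, 512, or 256 threads whose occupancy matches that of the 128-thread configuration, keeping 128 as the last resort. Below is a minimal host-side sketch of just that decision order, with the occupancy counts passed in as plain integers; the function name and parameters are illustrative and not part of the patch.

#include <cstdio>

// Illustrative only: mirrors the selection order of
// TryDispatchLayerNormBlockSMemImplBlockSize, with the occupancy values
// (active blocks per multiprocessor at each block size) supplied by the caller.
int PickLayerNormBlockSize(int occ_128, int occ_256, int occ_512, int occ_1024,
                           bool* success) {
  if (occ_128 <= 0) {
    // Even the smallest configuration cannot be resident with the required
    // dynamic shared memory; the caller should use the uncached kernel instead.
    *success = false;
    return 0;
  }
  *success = true;
  // Prefer the largest block size that keeps the same occupancy as 128 threads.
  if (occ_1024 == occ_128) { return 1024; }
  if (occ_512 == occ_128) { return 512; }
  if (occ_256 == occ_128) { return 256; }
  return 128;
}

int main() {
  bool ok = false;
  // Example: 512-thread blocks match the occupancy of 128-thread blocks,
  // but 1024-thread blocks would lower it, so 512 is chosen.
  int chosen = PickLayerNormBlockSize(/*occ_128=*/2, /*occ_256=*/2, /*occ_512=*/2,
                                      /*occ_1024=*/1, &ok);
  std::printf("success=%d block_size=%d\n", ok ? 1 : 0, chosen);
  return 0;
}

In other words, a larger block is only chosen when it costs no resident blocks per multiprocessor, which is the condition the occupancy comparisons above encode.
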
-template -__global__ void LayerNormBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, - ComputeType* mean, ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_mean = 0; - ComputeType thread_m2 = 0; - ComputeType thread_count = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); - } - } - ComputeType row_mean = 0; - ComputeType row_m2 = 0; - ComputeType row_count = 0; - WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, - &row_count); - ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - mean[row] = row_mean; - inv_variance[row] = row_inv_var; - } - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - const int pack_offset = pack_id * pack_size; - load.template load(pack, row, pack_offset); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { pack[i] = (pack[i] - row_mean) * row_inv_var; } - store.template store(pack, row, pack_offset); - } - } -} - -template -inline hipError_t LaunchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormBlockUncachedImpl, - block_size, 0, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormBlockUncachedImpl - <<>>(load, store, rows, cols, epsilon, mean, inv_variance); - return hipPeekAtLastError(); -} - -template -struct DispatchLayerNormBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols % 4 == 0) { - return LaunchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else if (cols % 2 == 0) { - return LaunchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else { - return LaunchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } - } -}; - -template -inline hipError_t DispatchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - return DispatchLayerNormBlockUncachedImplPackSize()( - stream, load, store, rows, cols, epsilon, mean, inv_variance); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols <= 1024) { - return DispatchLayerNormWarpImpl(stream, load, store, rows, cols, - epsilon, mean, inv_variance); - } else { - bool 
dispatch_smem_impl_success; - { - hipError_t err = TryDispatchLayerNormBlockSMemImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance, - &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - return DispatchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); -} - -/* -LayerNormGrad dx: -normalized = (x - mean) * inv_var -sum_stats1 = sum(scaled_dy) -sum_stats2 = sum(scaled_dy * normalized) -dx = cols * dy - sum_stats1 - normalized * sum_stats2 -dx *= inv_var / cols -*/ -template -__global__ void LayerNormGradWarpImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - constexpr int pack_per_thread = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - ComputeType normalized_buf[rows_per_access][cols_per_thread]; - ComputeType dy_buf[rows_per_access][cols_per_thread]; - const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); - const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int64_t num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType sum_stats1[rows_per_access]; - ComputeType sum_stats2[rows_per_access]; - ComputeType inv_variance_buf[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - const int global_row_id = row + row_id; - ComputeType mean_val = mean[global_row_id]; - inv_variance_buf[row_id] = inv_variance[global_row_id]; - sum_stats1[row_id] = 0; - sum_stats2[row_id] = 0; - ComputeType* row_normalized_buf = normalized_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - const int pack_offset = pack_id * pack_size; - if (!padding || col < cols) { - load_x.template load(row_normalized_buf + pack_offset, global_row_id, col); - load_scaled_dy.template load(row_dy_buf + pack_offset, global_row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int col_id = pack_offset + i; - // row_normalized_buf store x - row_normalized_buf[col_id] = - (row_normalized_buf[col_id] - mean_val) * inv_variance_buf[row_id]; - sum_stats1[row_id] += row_dy_buf[col_id]; - sum_stats2[row_id] += row_dy_buf[col_id] * row_normalized_buf[col_id]; - } - } - } - } - ComputeType warp_sum_stats1[rows_per_access]; - ComputeType warp_sum_stats2[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_sum_stats1[row_id] = - WarpAllReduce(sum_stats1[row_id]); - warp_sum_stats2[row_id] = - 
WarpAllReduce(sum_stats2[row_id]); - } -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - const int global_row_id = row + row_id; - ComputeType* row_normalized_buf = normalized_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; - const ComputeType inv_variance_over_cols = inv_variance_buf[row_id] * one_over_cols; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - for (int i = 0; i < pack_size; ++i) { - const int col_id = pack_id * pack_size + i; - row_dy_buf[col_id] = (cols * row_dy_buf[col_id] - warp_sum_stats1[row_id] - - row_normalized_buf[col_id] * warp_sum_stats2[row_id]) - * inv_variance_over_cols; - } - store.template store(row_dy_buf + pack_id * pack_size, global_row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = GetNumBlocks( - LayerNormGradWarpImpl, - block_size, 0, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradWarpImpl - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchLayerNormGradWarpImplPadding(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchLayerNormGradWarpImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - return LaunchLayerNormGradWarpImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } -} - -template -typename std::enable_if::type DispatchLayerNormGradWarpImplCols( - hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } else { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - 
DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchLayerNormGradWarpImplCols( - hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } else { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchLayerNormGradWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0) { - return DispatchLayerNormGradWarpImplCols( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - return DispatchLayerNormGradWarpImplCols( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } - } -}; - -template -inline hipError_t DispatchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - return DispatchLayerNormGradWarpImplPackSize()( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); -} - -template -__global__ void LayerNormGradBlockSMemImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* normalized_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = normalized_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - const ComputeType 
one_over_cols = static_cast(1.0) / static_cast(cols); - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType sum_stats1 = 0; - ComputeType sum_stats2 = 0; - const ComputeType mean_val = mean[row]; - const ComputeType inv_variance_val = inv_variance[row]; - const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; - normalized_buf[buf_offset] = normalized; - dy_buf[buf_offset] = dy_pack[i]; - sum_stats1 += dy_pack[i]; - sum_stats2 += dy_pack[i] * normalized; - } - } - const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); - const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 - - normalized_buf[buf_offset] * row_sum_stats2) - * inv_variance_over_cols; - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void LayerNormGradBlockSMemImpl_1024(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* normalized_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = normalized_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType sum_stats1 = 0; - ComputeType sum_stats2 = 0; - const ComputeType mean_val = mean[row]; - const ComputeType inv_variance_val = inv_variance[row]; - const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; - normalized_buf[buf_offset] = normalized; - dy_buf[buf_offset] = dy_pack[i]; - sum_stats1 += dy_pack[i]; - sum_stats2 += dy_pack[i] * normalized; - } - } - const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); - const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 - - normalized_buf[buf_offset] * row_sum_stats2) - * inv_variance_over_cols; - } - store.template store(pack, 
row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradBlockSMemImpl - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchLayerNormGradBlockSMemImpl_1024(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl_1024, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradBlockSMemImpl_1024 - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchLayerNormGradBlockSMemImplBlockSize( - hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, - const int64_t cols, bool* success) { - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType) * 2; - int max_active_blocks_conf_1; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - LayerNormGradBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - LayerNormGradBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormGradBlockSMemImpl_1024( - stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - LayerNormGradBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormGradBlockSMemImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - LayerNormGradBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormGradBlockSMemImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); - } - *success = true; - return LaunchLayerNormGradBlockSMemImpl(stream, load_x, load_scaled_dy, store, - mean, inv_variance, smem, rows, cols); -} - -template 
-struct TryDispatchLayerNormGradBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols, bool* success) { - if (cols % 2 == 0) { - return TryDispatchLayerNormGradBlockSMemImplBlockSize( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); - } else { - return TryDispatchLayerNormGradBlockSMemImplBlockSize( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); - } - } -}; - -template -inline hipError_t TryDispatchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols, - bool* success) { - return TryDispatchLayerNormGradBlockSMemImplPackSize()( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); -} - -template -__global__ void LayerNormGradBlockUncachedImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - const ComputeType mean_val = mean[row]; - const ComputeType inv_variance_val = inv_variance[row]; - const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; - ComputeType sum_stats1 = 0; - ComputeType sum_stats2 = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); - -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - sum_stats1 += dy_pack[i]; - sum_stats2 += dy_pack[i] * (x_pack[i] - mean_val) * inv_variance_val; - } - } - const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); - const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - dy_pack[i] = (cols * dy_pack[i] - row_sum_stats1 - - (x_pack[i] - mean_val) * inv_variance_val * row_sum_stats2) - * inv_variance_over_cols; - } - store.template store(dy_pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormGradBlockUncachedImpl, - block_size, 0, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradBlockUncachedImpl - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -struct 
DispatchLayerNormGradBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0 && cols > kWarpSize) { - return LaunchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - return LaunchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } - } -}; - -template -inline hipError_t DispatchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - return DispatchLayerNormGradBlockUncachedImplPackSize()( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols <= 1024) { - return DispatchLayerNormGradWarpImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = - TryDispatchLayerNormGradBlockSMemImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, - &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - return DispatchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); -} - -} // namespace layer_norm - -} // namespace cuda - -} // namespace oneflow - - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_LAYER_NORM_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef ONEFLOW_CORE_HIP_LAYER_NORM_H_ +#define ONEFLOW_CORE_HIP_LAYER_NORM_H_ + +#ifdef WITH_ROCM + +#include "hip/hip_runtime.h" +#include +// #include +#include + +namespace oneflow { + +namespace cuda { + +namespace layer_norm { + +constexpr int kWarpSize = 64; + +template +struct SumOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct MaxOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } +}; + +template class ReductionOp, typename T, int thread_group_width = kWarpSize> +__inline__ __device__ T WarpAllReduce(T val) { + for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { + // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask, thread_group_width)); + val = ReductionOp()(val, __shfl_xor(val, mask, thread_group_width)); + } + return val; +} + +template class ReductionOp, typename T, int block_size> +__inline__ __device__ T BlockAllReduce(T val) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T result_broadcast; + T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); + if (threadIdx.x == 0) { result_broadcast = result; } + __syncthreads(); + return result_broadcast; +} + +template +__inline__ __device__ T Div(T a, T b); + +template<> +__inline__ __device__ float Div(float a, float b) { +#ifdef OF_LAYER_NORM_USE_FAST_MATH + return __fdividef(a, b); +#else + return a / b; +#endif +} + +template<> +__inline__ __device__ double Div(double a, double b) { + return a / b; +} + +template +__inline__ __device__ T Rsqrt(T x); + +template<> +__inline__ __device__ float Rsqrt(float x) { +#ifdef OF_LAYER_NORM_USE_FAST_MATH + return __frsqrt_rn(x); +#else + return rsqrt(x); +#endif +} + +template<> +__inline__ __device__ double Rsqrt(double x) { + return rsqrt(x); +} + +template +inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, + int64_t max_blocks, int64_t waves, int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int max_active_blocks; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, + block_size, dynamic_smem_size); + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); + return hipSuccess; +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +// #if CUDA_VERSION >= 11000 +// template<> +// struct DefaultComputeType { +// using type = float; +// }; +// #endif // CUDA_VERSION >= 11000 + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * N, ""); + __device__ Pack() { + // do nothing + } + PackType storage; + T elem[N]; +}; + +template +struct DirectLoad { + DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { dst[i] = 
static_cast(pack.elem[i]); } + } + const SRC* src; + int64_t row_size; +}; + +template +struct DirectStore { + DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + Pack pack; + const int64_t offset = (row * row_size + col) / N; +#pragma unroll + for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + int64_t row_size; +}; + +template +inline __device__ void WelfordCombine(T val, T* mean, T* m2, T* count) { + // Use Welford Online algorithem to compute mean and variance + // For more details you can refer to: + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + *count += 1; + T delta1 = val - *mean; + *mean += Div(delta1, *count); + T delta2 = val - *mean; + *m2 += delta1 * delta2; +} + +template +inline __device__ void WelfordCombine(T b_mean, T b_m2, T b_count, T* mean, T* m2, T* count) { + if (b_count == 0) { return; } + T new_count = *count + b_count; + T nb_over_n = Div(b_count, new_count); + T delta = b_mean - *mean; + *mean += delta * nb_over_n; + *m2 += b_m2 + delta * delta * (*count) * nb_over_n; + *count = new_count; +} + +template +__inline__ __device__ void WelfordWarpReduce(T thread_mean, T thread_m2, T thread_count, T* mean, + T* m2, T* count) { + *mean = thread_mean; + *m2 = thread_m2; + *count = thread_count; + for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { + // T b_mean = __shfl_down(0xffffffff, *mean, mask, thread_group_width); + // T b_m2 = __shfl_down(0xffffffff, *m2, mask, thread_group_width); + // T b_count = __shfl_down(0xffffffff, *count, mask, thread_group_width); + T b_mean = __shfl_down(*mean, mask, thread_group_width); + T b_m2 = __shfl_down(*m2, mask, thread_group_width); + T b_count = __shfl_down(*count, mask, thread_group_width); + WelfordCombine(b_mean, b_m2, b_count, mean, m2, count); + } +} + +template +__inline__ __device__ void WelfordWarpAllReduce(T thread_mean, T thread_m2, T thread_count, T* mean, + T* m2, T* count) { + WelfordWarpReduce(thread_mean, thread_m2, thread_count, mean, m2, count); + // *mean = __shfl(0xffffffff, *mean, 0, thread_group_width); + // *m2 = __shfl(0xffffffff, *m2, 0, thread_group_width); + // *count = __shfl(0xffffffff, *count, 0, thread_group_width); + *mean = __shfl(*mean, 0, thread_group_width); + *m2 = __shfl(*m2, 0, thread_group_width); + *count = __shfl(*count, 0, thread_group_width); +} + +template +__inline__ __device__ void WelfordBlockAllReduce(T thread_mean, T thread_m2, T thread_count, + T* result_mean, T* result_m2, T* result_count) { + __shared__ T mean_shared[kWarpSize]; + __shared__ T m2_shared[kWarpSize]; + __shared__ T count_shared[kWarpSize]; + __shared__ T mean_result_broadcast; + __shared__ T m2_result_broadcast; + __shared__ T count_result_broadcast; + const int lid = threadIdx.x % kWarpSize; + const int wid = threadIdx.x / kWarpSize; + T warp_mean = 0; + T warp_m2 = 0; + T warp_count = 0; + WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count); + __syncthreads(); + if (lid == 0) { + mean_shared[wid] = warp_mean; + m2_shared[wid] = warp_m2; + count_shared[wid] = warp_count; + } + __syncthreads(); + if (wid == 0) { + if (threadIdx.x < blockDim.x / kWarpSize) { + warp_mean = mean_shared[lid]; + warp_m2 = m2_shared[lid]; + warp_count = count_shared[lid]; + } else { + warp_mean = static_cast(0); + warp_m2 = static_cast(0); + 
warp_count = static_cast(0); + } + __syncthreads(); + T block_mean = 0; + T block_m2 = 0; + T block_count = 0; + WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count); + if (lid == 0) { + mean_result_broadcast = block_mean; + m2_result_broadcast = block_m2; + count_result_broadcast = block_count; + } + } + __syncthreads(); + *result_mean = mean_result_broadcast; + *result_m2 = m2_result_broadcast; + *result_count = count_result_broadcast; +} + +template +__global__ void LayerNormWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + static_assert(cols_per_thread % pack_size == 0, ""); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + constexpr int num_packs = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + ComputeType buf[rows_per_access][cols_per_thread]; + const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int64_t num_global_thread_group = gridDim.x * blockDim.y; + const int64_t lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_mean[rows_per_access]; + ComputeType thread_m2[rows_per_access]; + ComputeType thread_count[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_mean[row_id] = 0; + thread_m2[row_id] = 0; + thread_count[row_id] = 0; + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < num_packs; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + const int pack_offset = pack_id * pack_size; + if (!padding || col < cols) { + load.template load(row_buf + pack_offset, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + WelfordCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, + thread_count + row_id); + } + } else { +#pragma unroll + for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = 0; } + } + } + } + ComputeType warp_mean[rows_per_access]; + ComputeType warp_m2[rows_per_access]; + ComputeType warp_count[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + int global_row_id = row + row_id; + ComputeType* row_buf = buf[row_id]; + WelfordWarpAllReduce( + thread_mean[row_id], thread_m2[row_id], thread_count[row_id], warp_mean + row_id, + warp_m2 + row_id, warp_count + row_id); + ComputeType row_mean = warp_mean[row_id]; + ComputeType row_variance = + max(Div(warp_m2[row_id], warp_count[row_id]), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (lane_id == 0) { + mean[global_row_id] = row_mean; + inv_variance[global_row_id] = row_inv_var; + } +#pragma unroll + for (int i = 0; i < cols_per_thread; ++i) { + row_buf[i] = (row_buf[i] - row_mean) * row_inv_var; + } +#pragma unroll + for (int i = 0; i < num_packs; ++i) { + const int col = (i * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + store.template store(row_buf + i * pack_size, global_row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) 
{ + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormWarpImpl, + block_size, 0, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormWarpImpl + <<>>(load, store, rows, cols, epsilon, mean, inv_variance); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchLayerNormWarpImplPadding(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchLayerNormWarpImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else { + return LaunchLayerNormWarpImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } +} + +template +typename std::enable_if::type DispatchLayerNormWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } else { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ + inv_variance); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchLayerNormWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } else { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } \ + } + 
DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ + inv_variance); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} +template +typename std::enable_if::type DispatchLayerNormWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } else { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ + inv_variance); \ + } + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchLayerNormWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols % 4 == 0) { + return DispatchLayerNormWarpImplCols( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else if (cols % 2 == 0) { + return DispatchLayerNormWarpImplCols( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else { + return DispatchLayerNormWarpImplCols( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } + } +}; + +template +inline hipError_t DispatchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + return DispatchLayerNormWarpImplPackSize()( + stream, load, store, rows, cols, epsilon, mean, inv_variance); +} + +template +__global__ void LayerNormBlockSMemImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += 
block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_count = 0; + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + mean[row] = row_mean; + inv_variance[row] = row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void LayerNormBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_count = 0; + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + mean[row] = row_mean; + inv_variance[row] = row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + + +template +inline hipError_t LaunchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, + int smem, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormBlockSMemImpl, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormBlockSMemImpl + <<>>(load, store, rows, cols, epsilon, mean, + inv_variance); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchLayerNormBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, + int smem, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + 
GetNumBlocks(LayerNormBlockSMemImpl_1024, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormBlockSMemImpl_1024 + <<>>(load, store, rows, cols, epsilon, mean, + inv_variance); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchLayerNormBlockSMemImplBlockSize( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance, bool* success) { + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType); + int max_active_blocks_conf_1; + + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + LayerNormBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + LayerNormBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormBlockSMemImpl_1024( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + LayerNormBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + LayerNormBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); + } + *success = true; + return LaunchLayerNormBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); +} + +template +struct TryDispatchLayerNormBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance, bool* success) { + if (cols % 4 == 0) { + return TryDispatchLayerNormBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); + } else if (cols % 2 == 0) { + return TryDispatchLayerNormBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); + } else { + return TryDispatchLayerNormBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); + } + } +}; + +template +inline hipError_t TryDispatchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance, bool* success) { + return TryDispatchLayerNormBlockSMemImplPackSize()( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); +} + 
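+// ---------------------------------------------------------------------------
+// Editor's note: the standalone sketch below is an illustration added for
+// clarity and is not part of the original kernels. The block-size helper
+// above (TryDispatchLayerNormBlockSMemImplBlockSize) selects a block size by
+// occupancy probing: it queries max-active-blocks-per-SM for the smallest
+// candidate (128) as a baseline, then picks the largest candidate (1024, 512,
+// 256) whose occupancy does not fall below that baseline, and falls back to
+// 128 otherwise; if even 128 threads cannot stay resident with the requested
+// dynamic shared memory, the shared-memory path is rejected and the uncached
+// kernel is used instead. DummyRowKernel and PickBlockSizeByOccupancy are
+// hypothetical names used only for this sketch.
+// ---------------------------------------------------------------------------
+template<int block_size>
+__global__ void DummyRowKernel(const float* in, float* out, int cols) {
+  // Stand-in for a row-wise kernel that stages one row in dynamic shared memory.
+  extern __shared__ float row_buf[];
+  for (int i = threadIdx.x; i < cols; i += block_size) { row_buf[i] = in[blockIdx.x * cols + i]; }
+  __syncthreads();
+  for (int i = threadIdx.x; i < cols; i += block_size) { out[blockIdx.x * cols + i] = row_buf[i]; }
+}
+
+inline hipError_t PickBlockSizeByOccupancy(size_t smem, int* block_size, bool* success) {
+  // Baseline occupancy at the smallest candidate block size.
+  int base = 0;
+  hipError_t err =
+      hipOccupancyMaxActiveBlocksPerMultiprocessor(&base, DummyRowKernel<128>, 128, smem);
+  if (err != hipSuccess) { return err; }
+  if (base <= 0) {
+    // Not enough shared memory for even 128 threads per block: reject this path.
+    *success = false;
+    return hipSuccess;
+  }
+  int occ = 0;
+  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&occ, DummyRowKernel<1024>, 1024, smem);
+  if (err != hipSuccess) { return err; }
+  if (occ == base) { *block_size = 1024; *success = true; return hipSuccess; }
+  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&occ, DummyRowKernel<512>, 512, smem);
+  if (err != hipSuccess) { return err; }
+  if (occ == base) { *block_size = 512; *success = true; return hipSuccess; }
+  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&occ, DummyRowKernel<256>, 256, smem);
+  if (err != hipSuccess) { return err; }
+  *block_size = (occ == base) ? 256 : 128;  // larger sizes would cost occupancy: keep 128
+  *success = true;
+  return hipSuccess;
+}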
+template +__global__ void LayerNormBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, + ComputeType* mean, ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_count = 0; + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + mean[row] = row_mean; + inv_variance[row] = row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + const int pack_offset = pack_id * pack_size; + load.template load(pack, row, pack_offset); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { pack[i] = (pack[i] - row_mean) * row_inv_var; } + store.template store(pack, row, pack_offset); + } + } +} + +template +inline hipError_t LaunchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormBlockUncachedImpl, + block_size, 0, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormBlockUncachedImpl + <<>>(load, store, rows, cols, epsilon, mean, inv_variance); + return hipPeekAtLastError(); +} + +template +struct DispatchLayerNormBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols % 4 == 0) { + return LaunchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else if (cols % 2 == 0) { + return LaunchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else { + return LaunchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } + } +}; + +template +inline hipError_t DispatchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + return DispatchLayerNormBlockUncachedImplPackSize()( + stream, load, store, rows, cols, epsilon, mean, inv_variance); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols <= 1024) { + return DispatchLayerNormWarpImpl(stream, load, store, rows, cols, + epsilon, mean, inv_variance); + } else { + bool 
dispatch_smem_impl_success; + { + hipError_t err = TryDispatchLayerNormBlockSMemImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance, + &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + return DispatchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); +} + +/* +LayerNormGrad dx: +normalized = (x - mean) * inv_var +sum_stats1 = sum(scaled_dy) +sum_stats2 = sum(scaled_dy * normalized) +dx = cols * dy - sum_stats1 - normalized * sum_stats2 +dx *= inv_var / cols +*/ +template +__global__ void LayerNormGradWarpImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + constexpr int pack_per_thread = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + ComputeType normalized_buf[rows_per_access][cols_per_thread]; + ComputeType dy_buf[rows_per_access][cols_per_thread]; + const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); + const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int64_t num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType sum_stats1[rows_per_access]; + ComputeType sum_stats2[rows_per_access]; + ComputeType inv_variance_buf[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + const int global_row_id = row + row_id; + ComputeType mean_val = mean[global_row_id]; + inv_variance_buf[row_id] = inv_variance[global_row_id]; + sum_stats1[row_id] = 0; + sum_stats2[row_id] = 0; + ComputeType* row_normalized_buf = normalized_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + const int pack_offset = pack_id * pack_size; + if (!padding || col < cols) { + load_x.template load(row_normalized_buf + pack_offset, global_row_id, col); + load_scaled_dy.template load(row_dy_buf + pack_offset, global_row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int col_id = pack_offset + i; + // row_normalized_buf store x + row_normalized_buf[col_id] = + (row_normalized_buf[col_id] - mean_val) * inv_variance_buf[row_id]; + sum_stats1[row_id] += row_dy_buf[col_id]; + sum_stats2[row_id] += row_dy_buf[col_id] * row_normalized_buf[col_id]; + } + } + } + } + ComputeType warp_sum_stats1[rows_per_access]; + ComputeType warp_sum_stats2[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_sum_stats1[row_id] = + WarpAllReduce(sum_stats1[row_id]); + warp_sum_stats2[row_id] = + 
WarpAllReduce(sum_stats2[row_id]); + } +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + const int global_row_id = row + row_id; + ComputeType* row_normalized_buf = normalized_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; + const ComputeType inv_variance_over_cols = inv_variance_buf[row_id] * one_over_cols; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + for (int i = 0; i < pack_size; ++i) { + const int col_id = pack_id * pack_size + i; + row_dy_buf[col_id] = (cols * row_dy_buf[col_id] - warp_sum_stats1[row_id] + - row_normalized_buf[col_id] * warp_sum_stats2[row_id]) + * inv_variance_over_cols; + } + store.template store(row_dy_buf + pack_id * pack_size, global_row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = GetNumBlocks( + LayerNormGradWarpImpl, + block_size, 0, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradWarpImpl + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchLayerNormGradWarpImplPadding(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchLayerNormGradWarpImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + return LaunchLayerNormGradWarpImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } +} + +template +typename std::enable_if::type DispatchLayerNormGradWarpImplCols( + hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } else { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + 
DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchLayerNormGradWarpImplCols( + hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } else { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchLayerNormGradWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0) { + return DispatchLayerNormGradWarpImplCols( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + return DispatchLayerNormGradWarpImplCols( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } + } +}; + +template +inline hipError_t DispatchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + return DispatchLayerNormGradWarpImplPackSize()( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); +} + +template +__global__ void LayerNormGradBlockSMemImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* normalized_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = normalized_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + const ComputeType 
one_over_cols = static_cast(1.0) / static_cast(cols); + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType sum_stats1 = 0; + ComputeType sum_stats2 = 0; + const ComputeType mean_val = mean[row]; + const ComputeType inv_variance_val = inv_variance[row]; + const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; + normalized_buf[buf_offset] = normalized; + dy_buf[buf_offset] = dy_pack[i]; + sum_stats1 += dy_pack[i]; + sum_stats2 += dy_pack[i] * normalized; + } + } + const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); + const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 + - normalized_buf[buf_offset] * row_sum_stats2) + * inv_variance_over_cols; + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void LayerNormGradBlockSMemImpl_1024(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* normalized_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = normalized_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType sum_stats1 = 0; + ComputeType sum_stats2 = 0; + const ComputeType mean_val = mean[row]; + const ComputeType inv_variance_val = inv_variance[row]; + const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; + normalized_buf[buf_offset] = normalized; + dy_buf[buf_offset] = dy_pack[i]; + sum_stats1 += dy_pack[i]; + sum_stats2 += dy_pack[i] * normalized; + } + } + const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); + const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 + - normalized_buf[buf_offset] * row_sum_stats2) + * inv_variance_over_cols; + } + store.template store(pack, 
row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradBlockSMemImpl + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchLayerNormGradBlockSMemImpl_1024(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl_1024, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradBlockSMemImpl_1024 + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchLayerNormGradBlockSMemImplBlockSize( + hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, + const int64_t cols, bool* success) { + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType) * 2; + int max_active_blocks_conf_1; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + LayerNormGradBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + LayerNormGradBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormGradBlockSMemImpl_1024( + stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + LayerNormGradBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormGradBlockSMemImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + LayerNormGradBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormGradBlockSMemImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); + } + *success = true; + return LaunchLayerNormGradBlockSMemImpl(stream, load_x, load_scaled_dy, store, + mean, inv_variance, smem, rows, cols); +} + +template 
+struct TryDispatchLayerNormGradBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols, bool* success) { + if (cols % 2 == 0) { + return TryDispatchLayerNormGradBlockSMemImplBlockSize( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); + } else { + return TryDispatchLayerNormGradBlockSMemImplBlockSize( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); + } + } +}; + +template +inline hipError_t TryDispatchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols, + bool* success) { + return TryDispatchLayerNormGradBlockSMemImplPackSize()( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); +} + +template +__global__ void LayerNormGradBlockUncachedImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + const ComputeType mean_val = mean[row]; + const ComputeType inv_variance_val = inv_variance[row]; + const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; + ComputeType sum_stats1 = 0; + ComputeType sum_stats2 = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); + +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + sum_stats1 += dy_pack[i]; + sum_stats2 += dy_pack[i] * (x_pack[i] - mean_val) * inv_variance_val; + } + } + const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); + const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + dy_pack[i] = (cols * dy_pack[i] - row_sum_stats1 + - (x_pack[i] - mean_val) * inv_variance_val * row_sum_stats2) + * inv_variance_over_cols; + } + store.template store(dy_pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormGradBlockUncachedImpl, + block_size, 0, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradBlockUncachedImpl + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +struct 
DispatchLayerNormGradBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0 && cols > kWarpSize) { + return LaunchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + return LaunchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } + } +}; + +template +inline hipError_t DispatchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + return DispatchLayerNormGradBlockUncachedImplPackSize()( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols <= 1024) { + return DispatchLayerNormGradWarpImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = + TryDispatchLayerNormGradBlockSMemImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, + &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + return DispatchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); +} + +} // namespace layer_norm + +} // namespace cuda + +} // namespace oneflow + + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_LAYER_NORM_H_ diff --git a/oneflow/core/hip/softmax.hip.h b/oneflow/core/hip/softmax.hip.h index 5cf7f05..f887d2e 100644 --- a/oneflow/core/hip/softmax.hip.h +++ b/oneflow/core/hip/softmax.hip.h @@ -1,1499 +1,1499 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#ifndef ONEFLOW_CORE_HIP_SOFTMAX_H_ -#define ONEFLOW_CORE_HIP_SOFTMAX_H_ - -#ifdef WITH_ROCM - -#include -// #include -#include -#include - -// #if CUDA_VERSION >= 11000 -// #include -// #endif // CUDA_VERSION >= 11000 - -namespace oneflow { - -namespace cuda { - -namespace softmax { - -constexpr int kWarpSize = 64; - -template -struct SumOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } -}; - -template -struct MaxOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } -}; - -template class ReductionOp, typename T, int thread_group_width = kWarpSize> -__inline__ __device__ T WarpAllReduce(T val) { - for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { - // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask)); - val = ReductionOp()(val, __shfl_xor(val, mask, kWarpSize)); - } - return val; -} - -template class ReductionOp, typename T, int block_size> -__inline__ __device__ T BlockAllReduce(T val) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T result_broadcast; - T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); - if (threadIdx.x == 0) { result_broadcast = result; } - __syncthreads(); - return result_broadcast; -} - -template -__inline__ __device__ T Inf(); - -template<> -__inline__ __device__ float Inf() { - return __int_as_float(0x7f800000U); -} - -template<> -__inline__ __device__ double Inf() { - return __longlong_as_double(0x7ff0000000000000ULL); -} - -template -__inline__ __device__ T Exp(T x); - -template<> -__inline__ __device__ float Exp(float x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __expf(x); -#else - return exp(x); -#endif -} - -template<> -__inline__ __device__ double Exp(double x) { - return exp(x); -} - -template -__inline__ __device__ T Div(T a, T b); - -template<> -__inline__ __device__ float Div(float a, float b) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __fdividef(a, b); -#else - return a / b; -#endif -} - -template<> -__inline__ __device__ double Div(double a, double b) { - return a / b; -} - -template -__inline__ __device__ T Log(T x); - -template<> -__inline__ __device__ float Log(float x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __logf(x); -#else - return log(x); -#endif -} -template<> -__inline__ __device__ double Log(double x) { - return log(x); -} - -inline hipError_t GetNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, - int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int tpm; - { - hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); - if (err != hipSuccess) { return err; } - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); - return hipSuccess; -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -// #if CUDA_VERSION >= 11000 -// template<> -// struct DefaultComputeType { -// using type = float; -// }; -// #endif // CUDA_VERSION >= 11000 - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == 
sizeof(T) * N, ""); - __device__ Pack() { - // do nothing - } - PackType storage; - T elem[N]; -}; - -template -struct DirectLoad { - DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { dst[i] = static_cast(pack.elem[i]); } - } - const SRC* src; - int64_t row_size; -}; - -template -struct DirectStore { - DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - Pack pack; - const int64_t offset = (row * row_size + col) / N; -#pragma unroll - for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - int64_t row_size; -}; - -enum class Algorithm { - kSoftmax = 0, - kLogSoftmax = 1, -}; - -template -__global__ void SoftmaxWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - constexpr int num_packs = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - ComputeType buf[rows_per_access][cols_per_thread]; - const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_max[row_id] = -Inf(); - ComputeType* row_buf = buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; ++pack_id) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - load.template load(row_buf + pack_offset, row + row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - thread_max[row_id] = max(thread_max[row_id], row_buf[pack_offset + i]); - } - } else { -#pragma unroll - for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = -Inf(); } - } - } - } - ComputeType warp_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_max[row_id] = WarpAllReduce(thread_max[row_id]); - } - ComputeType thread_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_sum[row_id] = 0; - ComputeType* row_buf = buf[row_id]; -#pragma unroll - for (int i = 0; i < cols_per_thread; ++i) { - if (algorithm == Algorithm::kSoftmax) { - row_buf[i] = Exp(row_buf[i] - warp_max[row_id]); - thread_sum[row_id] += row_buf[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - row_buf[i] -= warp_max[row_id]; - thread_sum[row_id] += Exp(row_buf[i]); - } else { - asm volatile("s_trap 0;"); - } - } - } - ComputeType warp_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); - } -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - ComputeType* row_buf 
= buf[row_id]; -#pragma unroll - for (int i = 0; i < cols_per_thread; ++i) { - if (algorithm == Algorithm::kSoftmax) { - row_buf[i] = Div(row_buf[i], warp_sum[row_id]); - } else if (algorithm == Algorithm::kLogSoftmax) { - row_buf[i] -= Log(warp_sum[row_id]); - } else { - asm volatile("s_trap 0;"); - } - } -#pragma unroll - for (int i = 0; i < num_packs; ++i) { - const int col = (i * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - store.template store(row_buf + i * pack_size, row + row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - // std::cout << "LaunchSoftmaxWarpImpl" << std::endl; - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxWarpImpl - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchSoftmaxWarpImplPadding(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } else { - return LaunchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } -} - -template -typename std::enable_if::type DispatchSoftmaxWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } else { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchSoftmaxWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= 
(thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } else { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchSoftmaxWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - if (cols % 2 == 0) { - return DispatchSoftmaxWarpImplCols(stream, load, - store, rows, cols); - } else { - return DispatchSoftmaxWarpImplCols(stream, load, - store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxWarpImplPackSize()(stream, load, store, - rows, cols); -} - -template -__global__ void SoftmaxBlockSMemImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_max = -Inf(); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - thread_max = max(thread_max, pack[i]); - } - } - const ComputeType row_max = BlockAllReduce(thread_max); - ComputeType thread_sum = 0; - for (int col = tid; col < cols; col += block_size) { - if (algorithm == Algorithm::kSoftmax) { - const ComputeType exp_x = Exp(buf[col] - row_max); - buf[col] = exp_x; - thread_sum += exp_x; - } else { - const ComputeType x = buf[col] - row_max; - buf[col] = x; - thread_sum += Exp(x); - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = Div(buf[i * num_packs + pack_id], row_sum); - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void SoftmaxBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 
0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_max = -Inf(); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - thread_max = max(thread_max, pack[i]); - } - } - const ComputeType row_max = BlockAllReduce(thread_max); - ComputeType thread_sum = 0; - for (int col = tid; col < cols; col += block_size) { - if (algorithm == Algorithm::kSoftmax) { - const ComputeType exp_x = Exp(buf[col] - row_max); - buf[col] = exp_x; - thread_sum += exp_x; - } else { - const ComputeType x = buf[col] - row_max; - buf[col] = x; - thread_sum += Exp(x); - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = Div(buf[i * num_packs + pack_id], row_sum); - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, int smem, - const int64_t rows, const int64_t cols) { - - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxBlockSMemImpl - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchSoftmaxBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, int smem, - const int64_t rows, const int64_t cols) { - - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxBlockSMemImpl_1024 - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchSoftmaxBlockSMemImplBlockSize(hipStream_t stream, LOAD load, - STORE store, const int64_t rows, - const int64_t cols, bool* success) { - - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType); - int max_active_blocks_conf_1; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - SoftmaxBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - SoftmaxBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxBlockSMemImpl_1024(stream, load, store, smem, rows, cols); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - SoftmaxBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - if 
(max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - SoftmaxBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); - } - *success = true; - return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); -} - -template -struct TryDispatchSoftmaxBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, bool* success) { - if (cols % 2 == 0) { - return TryDispatchSoftmaxBlockSMemImplBlockSize( - stream, load, store, rows, cols, success); - } else { - return TryDispatchSoftmaxBlockSMemImplBlockSize( - stream, load, store, rows, cols, success); - } - } -}; - -template -inline hipError_t TryDispatchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - bool* success) { - return TryDispatchSoftmaxBlockSMemImplPackSize()( - stream, load, store, rows, cols, success); -} - -template -__global__ void SoftmaxBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_max = -Inf(); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { thread_max = max(thread_max, pack[i]); } - } - const ComputeType row_max = BlockAllReduce(thread_max); - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { thread_sum += Exp(pack[i] - row_max); } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = Div(Exp(pack[i] - row_max), row_sum); - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = (pack[i] - row_max) - Log(row_sum); - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - // std::cout << "LaunchSoftmaxBlockUncachedImpl" << std::endl; - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxBlockUncachedImpl - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -struct DispatchSoftmaxBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t 
rows, - const int64_t cols) { - if (cols % 2 == 0) { - return LaunchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } else { - return LaunchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxBlockUncachedImplPackSize()( - stream, load, store, rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - if (cols < 1024) { - return DispatchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = - TryDispatchSoftmaxBlockSMemImpl( - stream, load, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - if (cols <= 1024) { - return DispatchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = - TryDispatchSoftmaxBlockSMemImpl( - stream, load, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); -} - -template -__global__ void SoftmaxGradWarpImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, - const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - constexpr int pack_per_thread = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - ComputeType y_buf[rows_per_access][cols_per_thread]; - ComputeType dy_buf[rows_per_access][cols_per_thread]; - const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_sum[row_id] = 0; - ComputeType* row_y_buf = y_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if 
(!padding || col < cols) { - load_y.template load(row_y_buf + pack_offset, row + row_id, col); - load_dy.template load(row_dy_buf + pack_offset, row + row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - thread_sum[row_id] += row_y_buf[pack_offset + i] * row_dy_buf[pack_offset + i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum[row_id] += row_dy_buf[pack_offset + i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - } - } - ComputeType warp_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); - } -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - ComputeType* row_y_buf = y_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - row_dy_buf[pack_offset + i] = - (row_dy_buf[pack_offset + i] - warp_sum[row_id]) * row_y_buf[pack_offset + i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - row_dy_buf[pack_offset + i] -= Exp(row_y_buf[pack_offset + i]) * warp_sum[row_id]; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(row_dy_buf + pack_offset, row + row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, - STORE store, const int64_t rows, const int64_t cols) { - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradWarpImpl - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchSoftmaxGradWarpImplPadding(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); - } else { - return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); - } -} - -template -typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( - hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } else { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return 
DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ - store, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( - hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } else { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ - store, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchSoftmaxGradWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0) { - return DispatchSoftmaxGradWarpImplCols( - stream, load_y, load_dy, store, rows, cols); - } else { - return DispatchSoftmaxGradWarpImplCols( - stream, load_y, load_dy, store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, - STORE store, const int64_t rows, - const int64_t cols) { - return DispatchSoftmaxGradWarpImplPackSize()( - stream, load_y, load_dy, store, rows, cols); -} - -template -__global__ void SoftmaxGradBlockSMemImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* y_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = y_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, 
row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - y_buf[i * num_packs + pack_id] = y_pack[i]; - dy_buf[i * num_packs + pack_id] = dy_pack[i]; - if (algorithm == Algorithm::kSoftmax) { - thread_sum += y_pack[i] * dy_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum += dy_pack[i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void SoftmaxGradBlockSMemImpl_1024(LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* y_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = y_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - y_buf[i * num_packs + pack_id] = y_pack[i]; - dy_buf[i * num_packs + pack_id] = dy_pack[i]; - if (algorithm == Algorithm::kSoftmax) { - thread_sum += y_pack[i] * dy_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum += dy_pack[i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradBlockSMemImpl - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchSoftmaxGradBlockSMemImpl_1024(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, 
&grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradBlockSMemImpl_1024 - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchSoftmaxGradBlockSMemImplBlockSize(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, - const int64_t cols, bool* success) { - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType) * 2; - int max_active_blocks_conf_1; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - SoftmaxGradBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - SoftmaxGradBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxGradBlockSMemImpl_1024(stream, load_y, load_dy, - store, smem, rows, cols); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - SoftmaxGradBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, - store, smem, rows, cols); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - SoftmaxGradBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, - store, smem, rows, cols); - } - *success = true; - return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, - store, smem, rows, cols); -} - -template -struct TryDispatchSoftmaxGradBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols, bool* success) { - if (cols % 2 == 0) { - return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, - rows, cols, success); - } else { - return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, - rows, cols, success); - } - } -}; - -template -inline hipError_t TryDispatchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols, - bool* success) { - return TryDispatchSoftmaxGradBlockSMemImplPackSize()(stream, load_y, load_dy, store, - rows, cols, success); -} - -template -__global__ void SoftmaxGradBlockUncachedImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; 
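The softmax-gradient kernels on both sides of this hunk (the CUDA-era versions being removed here and the HIP replacements added later in the patch) reduce one scalar per row and then rewrite dy in place: for Algorithm::kSoftmax the reduction is s = sum_j y_j * dy_j and the output is (dy_j - s) * y_j; for Algorithm::kLogSoftmax the reduction is s = sum_j dy_j and the output is dy_j - exp(y_j) * s. A minimal single-threaded C++ sketch of that row computation, intended only as a host-side cross-check (the function name, the local Algorithm enum, and the std::vector interface are illustrative stand-ins, not part of the header):

#include <cmath>
#include <cstddef>
#include <vector>

enum class Algorithm { kSoftmax, kLogSoftmax };

// Reference backward pass for one row: y is the forward output of the chosen
// algorithm, dy is the incoming gradient, dx receives the result.
void SoftmaxGradRowReference(Algorithm algorithm, const std::vector<float>& y,
                             const std::vector<float>& dy, std::vector<float>* dx) {
  const std::size_t cols = y.size();
  dx->resize(cols);
  double s = 0.0;  // plays the role of warp_sum / row_sum in the kernels
  for (std::size_t j = 0; j < cols; ++j) {
    s += (algorithm == Algorithm::kSoftmax) ? static_cast<double>(y[j]) * dy[j] : dy[j];
  }
  for (std::size_t j = 0; j < cols; ++j) {
    if (algorithm == Algorithm::kSoftmax) {
      (*dx)[j] = (dy[j] - static_cast<float>(s)) * y[j];
    } else {
      // For log-softmax, y holds log-probabilities, so exp(y) recovers the probability.
      (*dx)[j] = dy[j] - std::exp(y[j]) * static_cast<float>(s);
    }
  }
}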
- ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, row, pack_id * pack_size); - -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - thread_sum += y_pack[i] * dy_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum += dy_pack[i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - dy_pack[i] = (dy_pack[i] - row_sum) * y_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - dy_pack[i] -= Exp(y_pack[i]) * row_sum; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(dy_pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradBlockUncachedImpl - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -struct DispatchSoftmaxGradBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0 && cols > kWarpSize) { - return LaunchSoftmaxGradBlockUncachedImpl( - stream, load_y, load_dy, store, rows, cols); - } else { - return LaunchSoftmaxGradBlockUncachedImpl( - stream, load_y, load_dy, store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxGradBlockUncachedImplPackSize()(stream, load_y, load_dy, store, - rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols <= 1024) { - return DispatchSoftmaxGradWarpImpl( - stream, load_y, load_dy, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( - stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, - store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, store, - rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols <= 1024) { - return 
DispatchSoftmaxGradWarpImpl( - stream, load_y, load_dy, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( - stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, - store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, - store, rows, cols); -} - -} // namespace softmax - -} // namespace cuda - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_SOFTMAX_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef ONEFLOW_CORE_HIP_SOFTMAX_H_ +#define ONEFLOW_CORE_HIP_SOFTMAX_H_ + +#ifdef WITH_ROCM + +#include +// #include +#include +#include + +// #if CUDA_VERSION >= 11000 +// #include +// #endif // CUDA_VERSION >= 11000 + +namespace oneflow { + +namespace cuda { + +namespace softmax { + +constexpr int kWarpSize = 64; + +template +struct SumOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct MaxOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } +}; + +template class ReductionOp, typename T, int thread_group_width = kWarpSize> +__inline__ __device__ T WarpAllReduce(T val) { + for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { + // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask)); + val = ReductionOp()(val, __shfl_xor(val, mask, kWarpSize)); + } + return val; +} + +template class ReductionOp, typename T, int block_size> +__inline__ __device__ T BlockAllReduce(T val) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T result_broadcast; + T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); + if (threadIdx.x == 0) { result_broadcast = result; } + __syncthreads(); + return result_broadcast; +} + +template +__inline__ __device__ T Inf(); + +template<> +__inline__ __device__ float Inf() { + return __int_as_float(0x7f800000U); +} + +template<> +__inline__ __device__ double Inf() { + return __longlong_as_double(0x7ff0000000000000ULL); +} + +template +__inline__ __device__ T Exp(T x); + +template<> +__inline__ __device__ float Exp(float x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __expf(x); +#else + return exp(x); +#endif +} + +template<> +__inline__ __device__ double Exp(double x) { + return exp(x); +} + +template +__inline__ __device__ T Div(T a, T b); + +template<> +__inline__ __device__ float Div(float a, float b) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __fdividef(a, b); +#else + return a / b; 
+#endif +} + +template<> +__inline__ __device__ double Div(double a, double b) { + return a / b; +} + +template +__inline__ __device__ T Log(T x); + +template<> +__inline__ __device__ float Log(float x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __logf(x); +#else + return log(x); +#endif +} +template<> +__inline__ __device__ double Log(double x) { + return log(x); +} + +inline hipError_t GetNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, + int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int tpm; + { + hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); + if (err != hipSuccess) { return err; } + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); + return hipSuccess; +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +// #if CUDA_VERSION >= 11000 +// template<> +// struct DefaultComputeType { +// using type = float; +// }; +// #endif // CUDA_VERSION >= 11000 + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * N, ""); + __device__ Pack() { + // do nothing + } + PackType storage; + T elem[N]; +}; + +template +struct DirectLoad { + DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { dst[i] = static_cast(pack.elem[i]); } + } + const SRC* src; + int64_t row_size; +}; + +template +struct DirectStore { + DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + Pack pack; + const int64_t offset = (row * row_size + col) / N; +#pragma unroll + for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + int64_t row_size; +}; + +enum class Algorithm { + kSoftmax = 0, + kLogSoftmax = 1, +}; + +template +__global__ void SoftmaxWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + constexpr int num_packs = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + ComputeType buf[rows_per_access][cols_per_thread]; + const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_max[row_id] = -Inf(); + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int pack_id = 0; 
pack_id < num_packs; ++pack_id) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + load.template load(row_buf + pack_offset, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + thread_max[row_id] = max(thread_max[row_id], row_buf[pack_offset + i]); + } + } else { +#pragma unroll + for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = -Inf(); } + } + } + } + ComputeType warp_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_max[row_id] = WarpAllReduce(thread_max[row_id]); + } + ComputeType thread_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_sum[row_id] = 0; + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int i = 0; i < cols_per_thread; ++i) { + if (algorithm == Algorithm::kSoftmax) { + row_buf[i] = Exp(row_buf[i] - warp_max[row_id]); + thread_sum[row_id] += row_buf[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + row_buf[i] -= warp_max[row_id]; + thread_sum[row_id] += Exp(row_buf[i]); + } else { + asm volatile("s_trap 0;"); + } + } + } + ComputeType warp_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); + } +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int i = 0; i < cols_per_thread; ++i) { + if (algorithm == Algorithm::kSoftmax) { + row_buf[i] = Div(row_buf[i], warp_sum[row_id]); + } else if (algorithm == Algorithm::kLogSoftmax) { + row_buf[i] -= Log(warp_sum[row_id]); + } else { + asm volatile("s_trap 0;"); + } + } +#pragma unroll + for (int i = 0; i < num_packs; ++i) { + const int col = (i * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + store.template store(row_buf + i * pack_size, row + row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + // std::cout << "LaunchSoftmaxWarpImpl" << std::endl; + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxWarpImpl + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchSoftmaxWarpImplPadding(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } else { + return LaunchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } +} + +template +typename std::enable_if::type DispatchSoftmaxWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + 
return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } else { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchSoftmaxWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } else { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchSoftmaxWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols % 2 == 0) { + return DispatchSoftmaxWarpImplCols(stream, load, + store, rows, cols); + } else { + return DispatchSoftmaxWarpImplCols(stream, load, + store, rows, cols); + } + } +}; + +template +inline hipError_t DispatchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxWarpImplPackSize()(stream, load, store, + rows, cols); +} + +template +__global__ void SoftmaxBlockSMemImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_max = -Inf(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + 
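SoftmaxBlockSMemImpl and its _1024 variant stage one row in dynamic shared memory and then apply the standard numerically stable recipe: reduce the row maximum, exponentiate the shifted values, reduce their sum, and finish with a divide (softmax) or a log-subtraction (log-softmax). A single-threaded C++ reference of the same row computation, offered only as a sketch for validating kernel output (the function name and the local Algorithm enum are illustrative stand-ins; a non-empty row is assumed):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

enum class Algorithm { kSoftmax, kLogSoftmax };

// Numerically stable row softmax / log-softmax, mirroring the two block-wide
// reductions (row max, then row sum of exponentials) done by the GPU kernels.
std::vector<float> SoftmaxRowReference(Algorithm algorithm, const std::vector<float>& x) {
  const float row_max = *std::max_element(x.begin(), x.end());
  double row_sum = 0.0;
  for (float v : x) { row_sum += std::exp(static_cast<double>(v) - row_max); }
  std::vector<float> out(x.size());
  for (std::size_t j = 0; j < x.size(); ++j) {
    const double shifted = static_cast<double>(x[j]) - row_max;
    out[j] = (algorithm == Algorithm::kSoftmax)
                 ? static_cast<float>(std::exp(shifted) / row_sum)
                 : static_cast<float>(shifted - std::log(row_sum));
  }
  return out;
}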
ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + thread_max = max(thread_max, pack[i]); + } + } + const ComputeType row_max = BlockAllReduce(thread_max); + ComputeType thread_sum = 0; + for (int col = tid; col < cols; col += block_size) { + if (algorithm == Algorithm::kSoftmax) { + const ComputeType exp_x = Exp(buf[col] - row_max); + buf[col] = exp_x; + thread_sum += exp_x; + } else { + const ComputeType x = buf[col] - row_max; + buf[col] = x; + thread_sum += Exp(x); + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = Div(buf[i * num_packs + pack_id], row_sum); + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void SoftmaxBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_max = -Inf(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + thread_max = max(thread_max, pack[i]); + } + } + const ComputeType row_max = BlockAllReduce(thread_max); + ComputeType thread_sum = 0; + for (int col = tid; col < cols; col += block_size) { + if (algorithm == Algorithm::kSoftmax) { + const ComputeType exp_x = Exp(buf[col] - row_max); + buf[col] = exp_x; + thread_sum += exp_x; + } else { + const ComputeType x = buf[col] - row_max; + buf[col] = x; + thread_sum += Exp(x); + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = Div(buf[i * num_packs + pack_id], row_sum); + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, int smem, + const int64_t rows, const int64_t cols) { + + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxBlockSMemImpl + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchSoftmaxBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, int smem, + const int64_t rows, const int64_t cols) { + + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, 
rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxBlockSMemImpl_1024 + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchSoftmaxBlockSMemImplBlockSize(hipStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, bool* success) { + + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType); + int max_active_blocks_conf_1; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + SoftmaxBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + SoftmaxBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxBlockSMemImpl_1024(stream, load, store, smem, rows, cols); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + SoftmaxBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + SoftmaxBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); + } + *success = true; + return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); +} + +template +struct TryDispatchSoftmaxBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, bool* success) { + if (cols % 2 == 0) { + return TryDispatchSoftmaxBlockSMemImplBlockSize( + stream, load, store, rows, cols, success); + } else { + return TryDispatchSoftmaxBlockSMemImplBlockSize( + stream, load, store, rows, cols, success); + } + } +}; + +template +inline hipError_t TryDispatchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + bool* success) { + return TryDispatchSoftmaxBlockSMemImplPackSize()( + stream, load, store, rows, cols, success); +} + +template +__global__ void SoftmaxBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_max = -Inf(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { thread_max = max(thread_max, pack[i]); } + } + const ComputeType row_max = BlockAllReduce(thread_max); + ComputeType 
thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { thread_sum += Exp(pack[i] - row_max); } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = Div(Exp(pack[i] - row_max), row_sum); + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = (pack[i] - row_max) - Log(row_sum); + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + // std::cout << "LaunchSoftmaxBlockUncachedImpl" << std::endl; + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxBlockUncachedImpl + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +struct DispatchSoftmaxBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols % 2 == 0) { + return LaunchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } else { + return LaunchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } + } +}; + +template +inline hipError_t DispatchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxBlockUncachedImplPackSize()( + stream, load, store, rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols < 1024) { + return DispatchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = + TryDispatchSoftmaxBlockSMemImpl( + stream, load, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols <= 1024) { + return DispatchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = + TryDispatchSoftmaxBlockSMemImpl( + stream, load, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, 
hipError_t>::type +DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); +} + +template +__global__ void SoftmaxGradWarpImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, + const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + constexpr int pack_per_thread = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + ComputeType y_buf[rows_per_access][cols_per_thread]; + ComputeType dy_buf[rows_per_access][cols_per_thread]; + const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_sum[row_id] = 0; + ComputeType* row_y_buf = y_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + load_y.template load(row_y_buf + pack_offset, row + row_id, col); + load_dy.template load(row_dy_buf + pack_offset, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + thread_sum[row_id] += row_y_buf[pack_offset + i] * row_dy_buf[pack_offset + i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum[row_id] += row_dy_buf[pack_offset + i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + } + } + ComputeType warp_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); + } +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + ComputeType* row_y_buf = y_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + row_dy_buf[pack_offset + i] = + (row_dy_buf[pack_offset + i] - warp_sum[row_id]) * row_y_buf[pack_offset + i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + row_dy_buf[pack_offset + i] -= Exp(row_y_buf[pack_offset + i]) * warp_sum[row_id]; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(row_dy_buf + pack_offset, row + row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, + STORE store, const int64_t rows, const int64_t cols) { + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + 
thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradWarpImpl + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchSoftmaxGradWarpImplPadding(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); + } else { + return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); + } +} + +template +typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( + hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } else { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ + store, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( + hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } else { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ + store, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + 
DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchSoftmaxGradWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0) { + return DispatchSoftmaxGradWarpImplCols( + stream, load_y, load_dy, store, rows, cols); + } else { + return DispatchSoftmaxGradWarpImplCols( + stream, load_y, load_dy, store, rows, cols); + } + } +}; + +template +inline hipError_t DispatchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, + STORE store, const int64_t rows, + const int64_t cols) { + return DispatchSoftmaxGradWarpImplPackSize()( + stream, load_y, load_dy, store, rows, cols); +} + +template +__global__ void SoftmaxGradBlockSMemImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* y_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = y_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + y_buf[i * num_packs + pack_id] = y_pack[i]; + dy_buf[i * num_packs + pack_id] = dy_pack[i]; + if (algorithm == Algorithm::kSoftmax) { + thread_sum += y_pack[i] * dy_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum += dy_pack[i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void SoftmaxGradBlockSMemImpl_1024(LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* y_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = y_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + y_buf[i * num_packs + pack_id] = y_pack[i]; + dy_buf[i * num_packs + pack_id] = dy_pack[i]; + if (algorithm == Algorithm::kSoftmax) { + 
thread_sum += y_pack[i] * dy_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum += dy_pack[i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradBlockSMemImpl + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchSoftmaxGradBlockSMemImpl_1024(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradBlockSMemImpl_1024 + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchSoftmaxGradBlockSMemImplBlockSize(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, + const int64_t cols, bool* success) { + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType) * 2; + int max_active_blocks_conf_1; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + SoftmaxGradBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + SoftmaxGradBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxGradBlockSMemImpl_1024(stream, load_y, load_dy, + store, smem, rows, cols); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + SoftmaxGradBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, + store, smem, rows, cols); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + SoftmaxGradBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) 
{ + *success = true; + return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, + store, smem, rows, cols); + } + *success = true; + return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, + store, smem, rows, cols); +} + +template +struct TryDispatchSoftmaxGradBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols, bool* success) { + if (cols % 2 == 0) { + return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, + rows, cols, success); + } else { + return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, + rows, cols, success); + } + } +}; + +template +inline hipError_t TryDispatchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols, + bool* success) { + return TryDispatchSoftmaxGradBlockSMemImplPackSize()(stream, load_y, load_dy, store, + rows, cols, success); +} + +template +__global__ void SoftmaxGradBlockUncachedImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); + +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + thread_sum += y_pack[i] * dy_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum += dy_pack[i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + dy_pack[i] = (dy_pack[i] - row_sum) * y_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + dy_pack[i] -= Exp(y_pack[i]) * row_sum; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(dy_pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradBlockUncachedImpl + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +struct DispatchSoftmaxGradBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0 && cols > kWarpSize) { + return LaunchSoftmaxGradBlockUncachedImpl( + stream, load_y, load_dy, store, rows, cols); + } else { + return LaunchSoftmaxGradBlockUncachedImpl( + stream, load_y, load_dy, store, rows, cols); + 
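TryDispatchSoftmaxGradBlockSMemImplBlockSize above chooses a block size purely by occupancy: it asks hipOccupancyMaxActiveBlocksPerMultiprocessor how many 128-thread blocks fit per multiprocessor with the required dynamic shared memory, gives up on the shared-memory path if the answer is zero, and otherwise takes the largest of 1024/512/256 whose resident-block count matches the 128-thread baseline. A host-side sketch of just that decision, with the occupancy query abstracted behind a callback so the logic can be exercised without a GPU (the helper name and the callback are stand-ins, not HIP API):

#include <functional>
#include <initializer_list>

// occupancy(block_size) should return the maximum resident blocks per multiprocessor
// for the kernel variant launched with `block_size` threads and the required dynamic
// shared memory, i.e. what hipOccupancyMaxActiveBlocksPerMultiprocessor reports.
// Returns the chosen block size, or 0 when the shared-memory variant should be skipped.
int PickBlockSizeByOccupancy(const std::function<int(int)>& occupancy) {
  const int baseline = occupancy(128);
  if (baseline <= 0) { return 0; }  // even 128 threads cannot be scheduled: caller falls back to the uncached impl
  for (int candidate : {1024, 512, 256}) {
    // Prefer the largest block size that keeps the per-SM block count equal to the baseline.
    if (occupancy(candidate) == baseline) { return candidate; }
  }
  return 128;
}

Equal resident-block counts at a larger block size mean more threads in flight for the same shared-memory footprint, which is why the candidates are tried from 1024 downward.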
} + } +}; + +template +inline hipError_t DispatchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxGradBlockUncachedImplPackSize()(stream, load_y, load_dy, store, + rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols <= 1024) { + return DispatchSoftmaxGradWarpImpl( + stream, load_y, load_dy, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( + stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, + store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, store, + rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols <= 1024) { + return DispatchSoftmaxGradWarpImpl( + stream, load_y, load_dy, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( + stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, + store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, + store, rows, cols); +} + +} // namespace softmax + +} // namespace cuda + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_SOFTMAX_H_ diff --git a/oneflow/core/hip/unique.hip.h b/oneflow/core/hip/unique.hip.h index 6e7e671..4de2c65 100644 --- a/oneflow/core/hip/unique.hip.h +++ b/oneflow/core/hip/unique.hip.h @@ -1,251 +1,251 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
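Note on the backward dispatcher that closes softmax.hip.h above: it keeps the heuristic of the CUDA original, so rows with cols <= 1024 take the per-warp kernel, while larger rows first try the shared-memory kernel and fall back to the uncached kernel when the shared-memory variant reports that it cannot be launched. A minimal host-side sketch of invoking it follows, assuming the DirectLoad/DirectStore/DefaultComputeType helpers of the upstream CUDA header are also defined earlier in this HIP port; the wrapper name and float buffers are illustrative only, not taken from this patch.

// Hedged sketch, not part of the patch: assumes DirectLoad/DirectStore/DefaultComputeType
// are defined earlier in oneflow/core/hip/softmax.hip.h, as in the CUDA original.
#include <hip/hip_runtime.h>
#include "oneflow/core/hip/softmax.hip.h"

hipError_t SoftmaxGradExample(hipStream_t stream, const float* y, const float* dy, float* dx,
                              int64_t rows, int64_t cols) {
  using namespace oneflow::cuda::softmax;
  using ComputeType = typename DefaultComputeType<float>::type;  // float inputs compute in float
  DirectLoad<float, ComputeType> load_y(y, cols);     // reads softmax output y
  DirectLoad<float, ComputeType> load_dy(dy, cols);   // reads upstream gradient dy
  DirectStore<ComputeType, float> store(dx, cols);    // writes input gradient dx
  // cols <= 1024 -> warp kernel; otherwise shared-memory kernel with an uncached fallback.
  return DispatchSoftmaxGrad<decltype(load_y), decltype(load_dy), decltype(store), ComputeType>(
      stream, load_y, load_dy, store, rows, cols);
}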
-*/ -#ifndef ONEFLOW_CORE_HIP_UNIQUE_H_ -#define ONEFLOW_CORE_HIP_UNIQUE_H_ - -#ifdef WITH_ROCM - -#include -#include "hip/hip_runtime.h" -// #include -#include "hip/hip_runtime.h" -#include "oneflow/core/common/permutation_iterator.h" -#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" - -namespace oneflow { - -namespace cuda { - -namespace unique { - -using Flag = uint32_t; -static constexpr Flag kDefault = 0x0; -static constexpr Flag kInputSorted = 0x1; -static constexpr Flag kOutputInverseIndices = 0x1 << 1; -static constexpr Flag kOutputCounts = 0x1 << 2; - -namespace { - -constexpr size_t kCudaAlignSize = 512; - -__device__ __host__ __forceinline__ size_t GetCudaAlignedSize(size_t size) { - return (size + kCudaAlignSize - 1) / kCudaAlignSize * kCudaAlignSize; -} - -template -__device__ __host__ __forceinline__ T* PtrOffset(void* ptr, size_t offset) { - return reinterpret_cast(reinterpret_cast(ptr) + offset); -} - -__device__ __host__ __forceinline__ size_t max(size_t a, size_t b) { return a > b ? a : b; } - -template -hipError_t DoUnique(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, - void* workspace, size_t* workspace_size, hipStream_t stream) { - size_t ws = *workspace_size; - hipError_t err = hipcub::DeviceSelect::Unique( - workspace, ws, sorted_in, unique, num_unique, n, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DoUniqueWithCounts(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, - Index* counts, void* workspace, size_t* workspace_size, - hipStream_t stream) { - size_t ws = *workspace_size; - hipError_t err = hipcub::DeviceRunLengthEncode::Encode( - workspace, ws, sorted_in, unique, counts, num_unique, n, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DispatchOutputCounts(Flag flag, size_t n, const Key* sorted_in, Key* unique, - Index* num_unique, Index* counts, void* workspace, - size_t* workspace_size, hipStream_t stream) { - size_t ws = *workspace_size; - if ((flag & kOutputCounts) != 0) { - hipError_t err = DoUniqueWithCounts(n, sorted_in, unique, num_unique, counts, - workspace, &ws, stream); - if (err != hipSuccess) { return err; } - } else { - hipError_t err = - DoUnique(n, sorted_in, unique, num_unique, workspace, &ws, stream); - if (err != hipSuccess) { return err; } - } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DoGenInverseIndices(size_t n, const Key* sorted_in, - InverseIndicesIter inverse_indices_iter, void* workspace, - size_t* workspace_size, hipStream_t stream) { - size_t ws = *workspace_size; - NotEqualToPreviousAdjacentIterator unique_counting_iter(sorted_in, 0); - hipError_t err = - hipcub::DeviceScan::InclusiveSum( - workspace, ws, unique_counting_iter, inverse_indices_iter, n, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DispatchOutputInverseIndices(Flag flag, size_t n, const Key* sorted_in, Key* unique, - Index* num_unique, InverseIndicesIter inverse_indices_iter, - Index* counts, void* workspace, size_t* workspace_size, - hipStream_t stream) { - size_t dispatch_with_counts_ws = *workspace_size; - size_t do_gen_inverse_indices_ws = *workspace_size; - { - hipError_t err = - DispatchOutputCounts(flag, n, sorted_in, unique, 
num_unique, counts, workspace, - &dispatch_with_counts_ws, stream); - if (err != hipSuccess) { return err; } - } - if ((flag & kOutputInverseIndices) != 0) { - hipError_t err = DoGenInverseIndices( - n, sorted_in, inverse_indices_iter, workspace, &do_gen_inverse_indices_ws, stream); - if (err != hipSuccess) { return err; } - } - if (*workspace_size == 0) { - *workspace_size = max(dispatch_with_counts_ws, do_gen_inverse_indices_ws); - } - return hipSuccess; -} - -template -__global__ void IotaKernel(size_t n, T* out) { - for (T i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < n; - i += step) { - out[i] = i; - } -} - -template -hipError_t DoSort(size_t n, const Key* in, Key* sorted, Index* sorted_indices, void* workspace, - size_t* workspace_size, hipStream_t stream) { - Index* indices; - const size_t indices_size = GetCudaAlignedSize(n * sizeof(Index)); - void* sort_workspace; - size_t sort_ws; - if (*workspace_size == 0) { - indices = nullptr; - sort_workspace = nullptr; - sort_ws = 0; - } else { - if (*workspace_size <= indices_size) { return hipErrorInvalidValue; } - indices = PtrOffset(workspace, 0); - sort_workspace = PtrOffset(workspace, indices_size); - sort_ws = *workspace_size - indices_size; - } - if (*workspace_size != 0) { - const int block_size = 1024; - const int num_blocks = static_cast((n + block_size - 1) / block_size); - IotaKernel<<>>(n, indices); - } - hipError_t err = hipcub::DeviceRadixSort::SortPairs( - sort_workspace, sort_ws, in, sorted, indices, sorted_indices, n, 0, sizeof(Key) * 8, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = indices_size + sort_ws; } - return hipSuccess; -} - -template -hipError_t DispatchInputSorted(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, - Index* inverse_indices, Index* counts, void* workspace, - size_t* workspace_size, hipStream_t stream) { - if ((flag & kInputSorted) != 0) { - return DispatchOutputInverseIndices(flag, n, in, unique, num_unique, - inverse_indices, counts, workspace, - workspace_size, stream); - } else { - const size_t sorted_in_size = GetCudaAlignedSize(n * sizeof(Key)); - const size_t sorted_indices_size = GetCudaAlignedSize(n * sizeof(Index)); - const size_t sort_buffer_size = sorted_in_size + sorted_indices_size; - Key* sorted_in; - Index* sorted_indices; - size_t do_sort_ws; - void* do_sort_workspace; - size_t do_inverse_indices_ws; - void* do_inverse_indices_workspace; - if (*workspace_size == 0) { - sorted_in = nullptr; - sorted_indices = nullptr; - do_sort_ws = 0; - do_sort_workspace = nullptr; - do_inverse_indices_ws = 0; - do_inverse_indices_workspace = nullptr; - } else { - if (*workspace_size <= sort_buffer_size) { return hipErrorInvalidValue; } - sorted_in = PtrOffset(workspace, 0); - sorted_indices = PtrOffset(workspace, sorted_in_size); - do_sort_ws = *workspace_size - sort_buffer_size; - do_sort_workspace = PtrOffset(workspace, sort_buffer_size); - do_inverse_indices_ws = do_sort_ws; - do_inverse_indices_workspace = do_sort_workspace; - } - { - hipError_t err = DoSort(n, in, sorted_in, sorted_indices, do_sort_workspace, - &do_sort_ws, stream); - if (err != hipSuccess) { return err; } - } - PermutationIterator inverse_indices_iter(inverse_indices, - sorted_indices); - { - hipError_t err = DispatchOutputInverseIndices( - flag, n, sorted_in, unique, num_unique, inverse_indices_iter, counts, - do_inverse_indices_workspace, &do_inverse_indices_ws, stream); - if (err != hipSuccess) { return err; } - } - if 
(*workspace_size == 0) { - *workspace_size = sort_buffer_size + max(do_sort_ws, do_inverse_indices_ws); - } - return hipSuccess; - } -} - -} // namespace - -template -hipError_t Launch(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, - Index* inverse_indices, Index* counts, void* workspace, size_t workspace_size, - hipStream_t stream) { - if (workspace_size == 0) { return hipErrorInvalidValue; } - return DispatchInputSorted(flag, n, in, unique, num_unique, inverse_indices, counts, - workspace, &workspace_size, stream); -} - -template -hipError_t GetWorkspaceSize(Flag flag, size_t n, size_t* workspace_size) { - *workspace_size = 0; - return DispatchInputSorted(flag, n, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_size, 0); -} - -} // namespace unique - -} // namespace cuda - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_UNIQUE_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_HIP_UNIQUE_H_ +#define ONEFLOW_CORE_HIP_UNIQUE_H_ + +#ifdef WITH_ROCM + +#include +#include "hip/hip_runtime.h" +// #include +#include "hip/hip_runtime.h" +#include "oneflow/core/common/permutation_iterator.h" +#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" + +namespace oneflow { + +namespace cuda { + +namespace unique { + +using Flag = uint32_t; +static constexpr Flag kDefault = 0x0; +static constexpr Flag kInputSorted = 0x1; +static constexpr Flag kOutputInverseIndices = 0x1 << 1; +static constexpr Flag kOutputCounts = 0x1 << 2; + +namespace { + +constexpr size_t kCudaAlignSize = 512; + +__device__ __host__ __forceinline__ size_t GetCudaAlignedSize(size_t size) { + return (size + kCudaAlignSize - 1) / kCudaAlignSize * kCudaAlignSize; +} + +template +__device__ __host__ __forceinline__ T* PtrOffset(void* ptr, size_t offset) { + return reinterpret_cast(reinterpret_cast(ptr) + offset); +} + +__device__ __host__ __forceinline__ size_t max(size_t a, size_t b) { return a > b ? 
a : b; } + +template +hipError_t DoUnique(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, + void* workspace, size_t* workspace_size, hipStream_t stream) { + size_t ws = *workspace_size; + hipError_t err = hipcub::DeviceSelect::Unique( + workspace, ws, sorted_in, unique, num_unique, n, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DoUniqueWithCounts(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, + Index* counts, void* workspace, size_t* workspace_size, + hipStream_t stream) { + size_t ws = *workspace_size; + hipError_t err = hipcub::DeviceRunLengthEncode::Encode( + workspace, ws, sorted_in, unique, counts, num_unique, n, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DispatchOutputCounts(Flag flag, size_t n, const Key* sorted_in, Key* unique, + Index* num_unique, Index* counts, void* workspace, + size_t* workspace_size, hipStream_t stream) { + size_t ws = *workspace_size; + if ((flag & kOutputCounts) != 0) { + hipError_t err = DoUniqueWithCounts(n, sorted_in, unique, num_unique, counts, + workspace, &ws, stream); + if (err != hipSuccess) { return err; } + } else { + hipError_t err = + DoUnique(n, sorted_in, unique, num_unique, workspace, &ws, stream); + if (err != hipSuccess) { return err; } + } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DoGenInverseIndices(size_t n, const Key* sorted_in, + InverseIndicesIter inverse_indices_iter, void* workspace, + size_t* workspace_size, hipStream_t stream) { + size_t ws = *workspace_size; + NotEqualToPreviousAdjacentIterator unique_counting_iter(sorted_in, 0); + hipError_t err = + hipcub::DeviceScan::InclusiveSum( + workspace, ws, unique_counting_iter, inverse_indices_iter, n, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DispatchOutputInverseIndices(Flag flag, size_t n, const Key* sorted_in, Key* unique, + Index* num_unique, InverseIndicesIter inverse_indices_iter, + Index* counts, void* workspace, size_t* workspace_size, + hipStream_t stream) { + size_t dispatch_with_counts_ws = *workspace_size; + size_t do_gen_inverse_indices_ws = *workspace_size; + { + hipError_t err = + DispatchOutputCounts(flag, n, sorted_in, unique, num_unique, counts, workspace, + &dispatch_with_counts_ws, stream); + if (err != hipSuccess) { return err; } + } + if ((flag & kOutputInverseIndices) != 0) { + hipError_t err = DoGenInverseIndices( + n, sorted_in, inverse_indices_iter, workspace, &do_gen_inverse_indices_ws, stream); + if (err != hipSuccess) { return err; } + } + if (*workspace_size == 0) { + *workspace_size = max(dispatch_with_counts_ws, do_gen_inverse_indices_ws); + } + return hipSuccess; +} + +template +__global__ void IotaKernel(size_t n, T* out) { + for (T i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < n; + i += step) { + out[i] = i; + } +} + +template +hipError_t DoSort(size_t n, const Key* in, Key* sorted, Index* sorted_indices, void* workspace, + size_t* workspace_size, hipStream_t stream) { + Index* indices; + const size_t indices_size = GetCudaAlignedSize(n * sizeof(Index)); + void* sort_workspace; + size_t sort_ws; + if (*workspace_size == 0) { + indices = nullptr; + sort_workspace = nullptr; + sort_ws = 0; + } else { + if 
(*workspace_size <= indices_size) { return hipErrorInvalidValue; } + indices = PtrOffset(workspace, 0); + sort_workspace = PtrOffset(workspace, indices_size); + sort_ws = *workspace_size - indices_size; + } + if (*workspace_size != 0) { + const int block_size = 1024; + const int num_blocks = static_cast((n + block_size - 1) / block_size); + IotaKernel<<>>(n, indices); + } + hipError_t err = hipcub::DeviceRadixSort::SortPairs( + sort_workspace, sort_ws, in, sorted, indices, sorted_indices, n, 0, sizeof(Key) * 8, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = indices_size + sort_ws; } + return hipSuccess; +} + +template +hipError_t DispatchInputSorted(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, + Index* inverse_indices, Index* counts, void* workspace, + size_t* workspace_size, hipStream_t stream) { + if ((flag & kInputSorted) != 0) { + return DispatchOutputInverseIndices(flag, n, in, unique, num_unique, + inverse_indices, counts, workspace, + workspace_size, stream); + } else { + const size_t sorted_in_size = GetCudaAlignedSize(n * sizeof(Key)); + const size_t sorted_indices_size = GetCudaAlignedSize(n * sizeof(Index)); + const size_t sort_buffer_size = sorted_in_size + sorted_indices_size; + Key* sorted_in; + Index* sorted_indices; + size_t do_sort_ws; + void* do_sort_workspace; + size_t do_inverse_indices_ws; + void* do_inverse_indices_workspace; + if (*workspace_size == 0) { + sorted_in = nullptr; + sorted_indices = nullptr; + do_sort_ws = 0; + do_sort_workspace = nullptr; + do_inverse_indices_ws = 0; + do_inverse_indices_workspace = nullptr; + } else { + if (*workspace_size <= sort_buffer_size) { return hipErrorInvalidValue; } + sorted_in = PtrOffset(workspace, 0); + sorted_indices = PtrOffset(workspace, sorted_in_size); + do_sort_ws = *workspace_size - sort_buffer_size; + do_sort_workspace = PtrOffset(workspace, sort_buffer_size); + do_inverse_indices_ws = do_sort_ws; + do_inverse_indices_workspace = do_sort_workspace; + } + { + hipError_t err = DoSort(n, in, sorted_in, sorted_indices, do_sort_workspace, + &do_sort_ws, stream); + if (err != hipSuccess) { return err; } + } + PermutationIterator inverse_indices_iter(inverse_indices, + sorted_indices); + { + hipError_t err = DispatchOutputInverseIndices( + flag, n, sorted_in, unique, num_unique, inverse_indices_iter, counts, + do_inverse_indices_workspace, &do_inverse_indices_ws, stream); + if (err != hipSuccess) { return err; } + } + if (*workspace_size == 0) { + *workspace_size = sort_buffer_size + max(do_sort_ws, do_inverse_indices_ws); + } + return hipSuccess; + } +} + +} // namespace + +template +hipError_t Launch(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, + Index* inverse_indices, Index* counts, void* workspace, size_t workspace_size, + hipStream_t stream) { + if (workspace_size == 0) { return hipErrorInvalidValue; } + return DispatchInputSorted(flag, n, in, unique, num_unique, inverse_indices, counts, + workspace, &workspace_size, stream); +} + +template +hipError_t GetWorkspaceSize(Flag flag, size_t n, size_t* workspace_size) { + *workspace_size = 0; + return DispatchInputSorted(flag, n, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, workspace_size, 0); +} + +} // namespace unique + +} // namespace cuda + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_UNIQUE_H_ diff --git a/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp 
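Like the hipcub primitives it wraps, the unique.hip.h header above uses a two-pass calling convention: GetWorkspaceSize runs the whole dispatch chain with null buffers and *workspace_size == 0 so that only the required byte count is accumulated, and Launch then repeats the dispatch with a real workspace (it rejects workspace_size == 0). A hedged sketch of that calling pattern follows; the key/index types and buffer names are illustrative, not taken from this patch.

// Hedged sketch, not part of the patch: two-pass workspace query for oneflow::cuda::unique.
#include <hip/hip_runtime.h>
#include "oneflow/core/hip/unique.hip.h"

hipError_t UniqueExample(hipStream_t stream, const int64_t* in, size_t n, int64_t* unique_out,
                         int32_t* num_unique, int32_t* inverse_indices, int32_t* counts) {
  using namespace oneflow::cuda::unique;
  const Flag flag = kOutputInverseIndices | kOutputCounts;  // input is not pre-sorted
  size_t workspace_size = 0;
  // Pass 1: query the workspace size without touching any device buffers.
  hipError_t err = GetWorkspaceSize<int64_t, int32_t>(flag, n, &workspace_size);
  if (err != hipSuccess) { return err; }
  void* workspace = nullptr;
  err = hipMalloc(&workspace, workspace_size);
  if (err != hipSuccess) { return err; }
  // Pass 2: run the actual sort/unique/inverse-index pipeline on the stream.
  err = Launch<int64_t, int32_t>(flag, n, in, unique_out, num_unique, inverse_indices, counts,
                                 workspace, workspace_size, stream);
  const hipError_t free_err = hipFree(workspace);
  return err != hipSuccess ? err : free_err;
}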
b/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp index cab89de..3e2887c 100644 --- a/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp +++ b/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp @@ -1,665 +1,665 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/job/collective_boxing/nccl_executor_backend.h" -#include "oneflow/core/job/collective_boxing/request_store.h" -#include "oneflow/core/device/nccl_util.h" -#include "oneflow/core/graph/boxing/collective_boxing_util.h" -#include "oneflow/core/job/resource_desc.h" -#include "oneflow/core/control/ctrl_client.h" -#include "oneflow/core/control/global_process_ctx.h" -#include "oneflow/core/job/global_for.h" -#include "oneflow/core/thread/thread_pool.h" -#include "oneflow/core/device/cuda_util.h" - -#include - -#include -#include - -namespace oneflow { - -namespace boxing { - -namespace collective { - -namespace { - -ncclRedOp_t GetNcclReduceOp(ReduceMethod reduce_method) { - if (reduce_method == kReduceMethodSum) { - return ncclRedOp_t::ncclSum; - } else { - UNIMPLEMENTED(); - return ncclRedOp_t{}; - } -} - -std::string GetNcclUniqueIdRpcKey(const std::string& name, int64_t stream_id) { - return "CollectiveBoxingExecutorNcclUniqueIdRpcKey-" + name + "-" + std::to_string(stream_id); -} - -struct CopyParams { - void* dst; - const void* src; - int64_t count; -}; - -constexpr int64_t kMultiCopyParamsMaxSize = 128; -constexpr int64_t kMultiCopyAlignSize = 32; - -int64_t GetMultiCopyAlignedSize(int64_t size) { - return ((size + kMultiCopyAlignSize - 1) / kMultiCopyAlignSize) * kMultiCopyAlignSize; -} - -struct MultiCopyParams { - CopyParams params[kMultiCopyParamsMaxSize]; - int64_t count; - - MultiCopyParams() : count(0), params{} {} - - void Add(void* dst, const void* src, int64_t count) { - CHECK_LT(this->count, kMultiCopyParamsMaxSize); - params[this->count].dst = dst; - params[this->count].src = src; - params[this->count].count = count; - this->count += 1; - } -}; - -using BulkType = ulonglong2; - -__global__ void MultiCopyGpu(MultiCopyParams multi_params) { - for (int64_t p = 0; p < multi_params.count; ++p) { - const CopyParams params = multi_params.params[p]; - auto* bulk_dst = reinterpret_cast(params.dst); - const auto* bulk_src = reinterpret_cast(params.src); - const int64_t bulk_count = params.count / sizeof(BulkType); - CUDA_1D_KERNEL_LOOP_T(int64_t, i, bulk_count) { bulk_dst[i] = bulk_src[i]; } - const int64_t tail_offset = bulk_count * sizeof(BulkType); - auto* tail_dst = reinterpret_cast(params.dst) + tail_offset; - const auto* tail_src = reinterpret_cast(params.src) + tail_offset; - const int64_t tail_count = params.count - tail_offset; - CUDA_1D_KERNEL_LOOP_T(int64_t, i, tail_count) { tail_dst[i] = tail_src[i]; } - } -} - -void MultiCopy(hipStream_t stream, const MultiCopyParams& multi_params) { - if (multi_params.count <= 0) { return; } - CHECK_LE(multi_params.count, 
kMultiCopyParamsMaxSize); - int64_t max_count = multi_params.params[0].count; - for (int64_t i = 0; i < multi_params.count; ++i) { - max_count = std::max(max_count, multi_params.params[i].count); - } - hipLaunchKernelGGL(MultiCopyGpu, BlocksNum4ThreadsNum(max_count), kCudaThreadsNumPerBlock, 0, stream, - multi_params); -} - -class CommRank final { - public: - OF_DISALLOW_COPY(CommRank); - CommRank(int32_t device_id, int32_t global_rank, int32_t global_rank_count, int32_t local_rank, - int32_t local_rank_count) - : device_id_(device_id), - global_rank_(global_rank), - local_rank_(local_rank), - nccl_comm_(nullptr) {} - - CommRank(CommRank&& rhs) noexcept { - this->device_id_ = rhs.device_id_; - this->global_rank_ = rhs.global_rank_; - this->local_rank_ = rhs.local_rank_; - this->nccl_comm_ = rhs.nccl_comm_; - rhs.nccl_comm_ = nullptr; - } - - ~CommRank() { - if (nccl_comm_ != nullptr) { - CudaCurrentDeviceGuard guard(device_id_); - OF_NCCL_CHECK(ncclCommDestroy(nccl_comm_)); - } - } - - int32_t device_id() const { return device_id_; } - - ncclComm_t nccl_comm() const { return nccl_comm_; } - - void InitRank(ncclUniqueId unique_id, int32_t global_rank_count) { - CudaCurrentDeviceGuard guard(device_id_); - OF_NCCL_CHECK(ncclCommInitRank(&nccl_comm_, global_rank_count, unique_id, global_rank_)); - } - - private: - int32_t device_id_; - int32_t global_rank_; - int32_t local_rank_; - ncclComm_t nccl_comm_; -}; - -class CommGroup final { - public: - OF_DISALLOW_COPY(CommGroup); - CommGroup() = default; - ~CommGroup() = default; - CommGroup(CommGroup&& rhs) noexcept { - rank_vec_.swap(rhs.rank_vec_); - global_rank_count_ = rhs.global_rank_count_; - } - - void InitGroup(const DeviceSet& device_set, const std::string& unique_name) { - CudaCurrentDeviceGuard guard; - const int64_t this_machine_id = GlobalProcessCtx::Rank(); - global_rank_count_ = device_set.device_size(); - std::vector local_ranks; - for (int32_t i = 0; i < global_rank_count_; ++i) { - if (device_set.device(i).machine_id() == this_machine_id) { local_ranks.emplace_back(i); } - } - const int32_t local_rank_count = local_ranks.size(); - CHECK_GT(local_rank_count, 0); - ncclUniqueId nccl_unique_id{}; - if (local_ranks.front() == 0) { - OF_NCCL_CHECK(ncclGetUniqueId(&nccl_unique_id)); - if (local_rank_count != global_rank_count_) { - Singleton::Get()->PushKV(unique_name, NcclUniqueIdToString(nccl_unique_id)); - } - } else { - Singleton::Get()->PullKV(unique_name, [&nccl_unique_id](const std::string& val) { - NcclUniqueIdFromString(val, &nccl_unique_id); - }); - } - rank_vec_.reserve(local_rank_count); - OF_NCCL_CHECK(ncclGroupStart()); - for (int32_t local_rank = 0; local_rank < local_ranks.size(); ++local_rank) { - const int32_t global_rank = local_ranks.at(local_rank); - const int32_t device_id = device_set.device(global_rank).device_id(); - OF_CUDA_CHECK(hipSetDevice(device_id)); - rank_vec_.emplace_back(device_id, global_rank, global_rank_count_, local_rank, - local_rank_count); - rank_vec_.at(local_rank).InitRank(nccl_unique_id, global_rank_count_); - } - OF_NCCL_CHECK(ncclGroupEnd()); - } - - int32_t global_rank_count() const { return global_rank_count_; } - - int32_t local_rank_count() const { return rank_vec_.size(); } - - const CommRank& GetCommRank(int32_t local_rank) const { return rank_vec_.at(local_rank); } - - private: - std::vector rank_vec_; - int32_t global_rank_count_ = 0; -}; - -class StreamCtx { - public: - OF_DISALLOW_COPY(StreamCtx); - StreamCtx(int32_t device_id, size_t fusion_buffer_size) - : 
device_id_(device_id), fusion_buffer_size_(fusion_buffer_size) { - CudaCurrentDeviceGuard guard(device_id_); - int priority; - OF_CUDA_CHECK(hipDeviceGetStreamPriorityRange(nullptr, &priority)); - OF_CUDA_CHECK(hipStreamCreateWithPriority(&stream_, hipStreamNonBlocking, priority)); - OF_CUDA_CHECK(hipMalloc(&fusion_buffer_, fusion_buffer_size_)); - cb_event_poller_ = std::thread(&StreamCtx::PollEvent, this); - } - ~StreamCtx() { - cb_event_chan_.Close(); - cb_event_poller_.join(); - CudaCurrentDeviceGuard guard(device_id_); - OF_CUDA_CHECK(hipStreamSynchronize(stream_)); - OF_CUDA_CHECK(hipStreamDestroy(stream_)); - OF_CUDA_CHECK(hipFree(fusion_buffer_)); - } - - void PollEvent() { - CudaCurrentDeviceGuard guard(device_id_); - while (true) { - std::pair> cb_event; - ChannelStatus status = cb_event_chan_.Receive(&cb_event); - if (status == kChannelStatusErrorClosed) { break; } - CHECK_EQ(status, kChannelStatusSuccess); - OF_CUDA_CHECK(hipEventSynchronize(cb_event.first)); - cb_event.second(); - OF_CUDA_CHECK(hipEventDestroy(cb_event.first)); - } - } - - void AddCallback(const std::function& callback) { - hipEvent_t event; - OF_CUDA_CHECK(hipEventCreateWithFlags(&event, hipEventDisableTiming)); - OF_CUDA_CHECK(hipEventRecord(event, stream_)); - CHECK_EQ(cb_event_chan_.Send(std::make_pair(event, callback)), kChannelStatusSuccess); - } - - int32_t device_id() const { return device_id_; } - - hipStream_t stream() const { return stream_; } - - size_t fusion_buffer_size() const { return fusion_buffer_size_; } - - char* fusion_buffer() const { return fusion_buffer_; } - - private: - int32_t device_id_; - hipStream_t stream_ = nullptr; - size_t fusion_buffer_size_; - char* fusion_buffer_ = nullptr; - Channel>> cb_event_chan_; - std::thread cb_event_poller_; -}; - -void LaunchFusedAllReduce(const CommGroup& comm_group, - const std::vector>& device_id2stream_ctx, - const std::shared_ptr& request_store, - const std::vector& request_ids) { - CHECK_LE(request_ids.size(), kMultiCopyParamsMaxSize); - RequestEntry* first_request_entry = request_store->MutRequestEntry(request_ids.front()); - const ncclDataType_t nccl_data_type = - GetNcclDataType(first_request_entry->desc().op_desc().data_type()); - const ncclRedOp_t nccl_reduce_op = - GetNcclReduceOp(first_request_entry->desc().op_desc().reduce_method()); - const int64_t size_of_data_type = - GetSizeOfDataType(first_request_entry->desc().op_desc().data_type()); - std::vector offset_vec; - offset_vec.reserve(request_ids.size()); - int64_t offset = 0; - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - offset_vec.emplace_back(offset); - offset += GetMultiCopyAlignedSize(request_entry->size_in_bytes()); - }); - const int64_t elem_cnt = offset / size_of_data_type; - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - MultiCopyParams copy_in_params; - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - CHECK_LE(offset, stream_ctx->fusion_buffer_size()); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - copy_in_params.Add(stream_ctx->fusion_buffer() + offset_vec.at(i), - request_entry->GetRuntimeRequest(local_rank)->send_buff, - request_entry->size_in_bytes()); - }); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - 
MultiCopy(stream_ctx->stream(), copy_in_params); - } - - OF_NCCL_CHECK(ncclGroupStart()); - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - OF_NCCL_CHECK(ncclAllReduce(stream_ctx->fusion_buffer(), stream_ctx->fusion_buffer(), elem_cnt, - nccl_data_type, nccl_reduce_op, comm_rank.nccl_comm(), - stream_ctx->stream())); - } - OF_NCCL_CHECK(ncclGroupEnd()); - - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - MultiCopyParams copy_out_params; - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - copy_out_params.Add(request_entry->GetRuntimeRequest(local_rank)->recv_buff, - stream_ctx->fusion_buffer() + offset_vec.at(i), - request_entry->size_in_bytes()); - }); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - MultiCopy(stream_ctx->stream(), copy_out_params); - } -} - -void LaunchAggregatedOps(const CommGroup& comm_group, - const std::vector>& device_id2stream_ctx, - const std::shared_ptr& request_store, - const std::vector& request_ids) { - OF_NCCL_CHECK(ncclGroupStart()); - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const auto comm = comm_rank.nccl_comm(); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const auto& op_desc = request_entry->desc().op_desc(); - const std::shared_ptr& runtime_request_info = - request_entry->GetRuntimeRequest(local_rank); - const OpType op_type = op_desc.op_type(); - const void* send_buff = runtime_request_info->send_buff; - void* recv_buff = runtime_request_info->recv_buff; - const int64_t elem_cnt = request_entry->elem_cnt(); - const ncclDataType_t nccl_data_type = GetNcclDataType(op_desc.data_type()); - const int32_t num_ranks = comm_group.global_rank_count(); - if (op_type == OpType::kOpTypeAllReduce) { - OF_NCCL_CHECK(ncclAllReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, - GetNcclReduceOp(op_desc.reduce_method()), comm, - stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeAllGather) { - CHECK_EQ(elem_cnt % num_ranks, 0); - OF_NCCL_CHECK(ncclAllGather(send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, - comm, stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeReduceScatter) { - CHECK_EQ(elem_cnt % num_ranks, 0); - OF_NCCL_CHECK(ncclReduceScatter( - send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, - GetNcclReduceOp(op_desc.reduce_method()), comm, stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeReduce) { - OF_NCCL_CHECK(ncclReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, - GetNcclReduceOp(op_desc.reduce_method()), op_desc.root(), comm, - stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeBroadcast) { - OF_NCCL_CHECK(ncclBroadcast(send_buff, recv_buff, elem_cnt, nccl_data_type, - op_desc.root(), comm, 
stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeAll2All) { -#if NCCL_VERSION_CODE > 2700 - const int64_t elem_per_rank = elem_cnt / num_ranks; - const int64_t elem_per_chunk = elem_per_rank / num_ranks; - const int64_t dtype_size = GetSizeOfDataType(op_desc.data_type()); - const int64_t chunk_size = elem_per_chunk * dtype_size; - for (int64_t j = 0; j < num_ranks; ++j) { - OF_NCCL_CHECK(ncclSend(reinterpret_cast( - reinterpret_cast(send_buff) + j * chunk_size), - elem_per_chunk, nccl_data_type, j, comm, - stream_ctx->stream())); - OF_NCCL_CHECK(ncclRecv( - reinterpret_cast(reinterpret_cast(recv_buff) + j * chunk_size), - elem_per_chunk, nccl_data_type, j, comm, stream_ctx->stream())); - } -#else - UNIMPLEMENTED(); -#endif - } else { - UNIMPLEMENTED(); - } - }); - } - OF_NCCL_CHECK(ncclGroupEnd()); -} - -void AddCallbackAndResetRuntimeRequest( - const CommGroup& comm_group, - const std::vector>& device_id2stream_ctx, - const std::shared_ptr& request_store, const std::vector& request_ids) { - std::vector>> saved_runtime_request_info( - request_ids.size()); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - saved_runtime_request_info.at(i) = std::move(request_entry->ResetRuntimeRequest()); - }); - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - auto runtime_request_info_vec = - std::make_shared>>(); - runtime_request_info_vec->reserve(request_ids.size()); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - runtime_request_info_vec->emplace_back( - std::move(saved_runtime_request_info.at(i).at(local_rank))); - }); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - stream_ctx->AddCallback([runtime_request_info_vec]() { - for (auto& runtime_request_info : *runtime_request_info_vec) { - runtime_request_info->callback(Maybe::Ok()); - } - }); - } -} - -} // namespace - -struct NcclExecutorBackend::Impl { - Impl(const CollectiveBoxingConf& conf, std::shared_ptr request_store) - : conf(conf), request_store(std::move(request_store)) { - CHECK_GT(conf.nccl_num_streams(), 0); - CHECK_GE(conf.nccl_fusion_threshold_mb(), 0); - fusion_threshold = conf.nccl_fusion_threshold_mb() * 1024 * 1024; - num_streams = conf.nccl_num_streams(); - current_stream_id = 0; - enable_mixed_fusion = - (!conf.nccl_fusion_all_reduce_use_buffer()) && conf.nccl_enable_mixed_fusion(); - int nccl_version; - OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); - if (nccl_version == 21003) { - LOG(WARNING) - << "Current nccl version is 2.10.3, in this version, ncclGroup() with mixed " - "datatype/element/collective could induce crash or corruption, so we will not " - "fuse any request."; - } - InitStreamCtx(); - InitIsOpTypeFusionEnabled(); - } - ~Impl() { - stream_id2device_id2stream_ctx.clear(); - device_set2stream_id2comm_group.clear(); - } - - void InitCommGroup(int64_t job_id) { - std::set local_device_ids; - request_store->ForEachMutRequestEntryInJob( - job_id, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const auto& request = request_entry->desc(); - if (request.op_desc().backend() != Backend::kBackendNCCL) { return; } - if (!request_entry->HasRankOnThisNode()) { return; } - const DeviceSet& device_set = 
request.device_set(); - if (device_set2stream_id2comm_group.count(device_set) > 0) { return; } - auto& stream_id2comm_group = device_set2stream_id2comm_group[device_set]; - stream_id2comm_group.resize(num_streams); - for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { - stream_id2comm_group.at(stream_id).InitGroup( - device_set, GetNcclUniqueIdRpcKey(request.op_desc().name(), stream_id)); - } - for (int32_t j = 0; j < stream_id2comm_group.at(0).local_rank_count(); ++j) { - local_device_ids.emplace(stream_id2comm_group.at(0).GetCommRank(j).device_id()); - } - }); - for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { - for (const int64_t device_id : local_device_ids) { - if (stream_id2device_id2stream_ctx.at(stream_id).at(device_id) == nullptr) { - stream_id2device_id2stream_ctx.at(stream_id).at(device_id) = - std::make_unique(device_id, fusion_threshold); - } - } - } - } - - void InitStreamCtx() { - int32_t num_devices; - OF_CUDA_CHECK(hipGetDeviceCount(&num_devices)); - stream_id2device_id2stream_ctx.resize(num_streams); - for (int64_t stream_id = 0; stream_id < num_streams; ++stream_id) { - stream_id2device_id2stream_ctx.at(stream_id).resize(num_devices); - } - } - - void InitIsOpTypeFusionEnabled() { - op_type2fusion_enabled.resize(OpType_ARRAYSIZE, false); - op_type2fusion_enabled.at(OpType::kOpTypeAllReduce) = conf.nccl_fusion_all_reduce(); - op_type2fusion_enabled.at(OpType::kOpTypeAllGather) = conf.nccl_fusion_all_gather(); - op_type2fusion_enabled.at(OpType::kOpTypeReduceScatter) = conf.nccl_fusion_reduce_scatter(); - op_type2fusion_enabled.at(OpType::kOpTypeReduce) = conf.nccl_fusion_reduce(); - op_type2fusion_enabled.at(OpType::kOpTypeBroadcast) = conf.nccl_fusion_broadcast(); - op_type2fusion_enabled.at(OpType::kOpTypeAll2All) = false; - } - - int32_t NextStreamId() { - const int32_t stream_id = current_stream_id; - current_stream_id = (current_stream_id + 1) % num_streams; - return stream_id; - } - - bool IsOpTypeFusionEnabled(OpType op_type) const { return op_type2fusion_enabled.at(op_type); } - - bool IsRequestEntryFusionEnabled(const RequestEntry* entry) const { - return IsOpTypeFusionEnabled(entry->desc().op_desc().op_type()); - } - - bool CanRequestEntryFuse(const RequestEntry* lhs, const RequestEntry* rhs) const { - { - int nccl_version; - OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); - // Workaround for https://github.com/NVIDIA/nccl/issues/560 - if (nccl_version == 21003) { return false; } - } - if (lhs->device_set_symbol() != rhs->device_set_symbol()) { return false; } - if ((!IsRequestEntryFusionEnabled(lhs)) || (!IsRequestEntryFusionEnabled(rhs))) { - return false; - } - if ((!enable_mixed_fusion) - && lhs->desc().op_desc().op_type() != rhs->desc().op_desc().op_type()) { - return false; - } - if (conf.nccl_fusion_all_reduce_use_buffer()) { - if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce - && rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { - CHECK(lhs->desc().op_desc().has_reduce_method()); - CHECK(rhs->desc().op_desc().has_reduce_method()); - return lhs->desc().op_desc().reduce_method() == rhs->desc().op_desc().reduce_method() - && lhs->desc().op_desc().data_type() == rhs->desc().op_desc().data_type(); - } else if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce - || rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { - return false; - } else { - return true; - } - } else { - return true; - } - } - - void GroupRequests(const std::vector& request_ids, - const std::function&&, 
void*)>& Handler) { - std::vector group; - int64_t group_size = 0; - const int64_t fusion_max_ops = std::min(conf.nccl_fusion_max_ops(), kMultiCopyParamsMaxSize); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const auto& request = request_entry->desc(); - const int64_t size = GetMultiCopyAlignedSize(request_entry->size_in_bytes()); - if (group.empty() - || !CanRequestEntryFuse(request_store->MutRequestEntry(group.back()), request_entry) - || group_size + size > fusion_threshold || group.size() >= fusion_max_ops) { - if (!group.empty()) { - void* token = CreateGroupToken(group); - Handler(std::move(group), token); - group.clear(); - group_size = 0; - } - } - group.emplace_back(request_id); - group_size += size; - }); - if (!group.empty()) { - void* token = CreateGroupToken(group); - Handler(std::move(group), token); - } - } - - struct GroupToken { - GroupToken(const std::vector& group, std::vector* stream_id2comm_group) - : request_ids(group), stream_id2comm_group(stream_id2comm_group) {} - std::vector request_ids; - std::vector* stream_id2comm_group; - }; - - void* CreateGroupToken(const std::vector& group) { - CHECK_GT(group.size(), 0); - void* group_token; - const DeviceSet& first_device_set = - request_store->MutRequestEntry(group.front())->desc().device_set(); - auto it = device_set2stream_id2comm_group.find(first_device_set); - CHECK(it != device_set2stream_id2comm_group.end()); - group_token = new GroupToken(group, &it->second); - request_store->ForEachMutRequestEntryForIdsInJob( - group, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const DeviceSet& device_set = request_entry->desc().device_set(); - CHECK(first_device_set == device_set); - }); - return group_token; - } - - void DestroyGroupToken(void* group_token) { - GroupToken* token = static_cast(group_token); - delete token; - } - - void ExecuteGroup(void* group_token) { - GroupToken* token = static_cast(group_token); - const std::vector& request_ids = token->request_ids; - if (request_ids.empty()) { return; } - const int32_t stream_id = NextStreamId(); - CudaCurrentDeviceGuard device_guard; - const auto& comm_group = token->stream_id2comm_group->at(stream_id); - auto& device_id2stream_ctx = stream_id2device_id2stream_ctx.at(stream_id); - if (request_store->MutRequestEntry(request_ids.front())->desc().op_desc().op_type() - == OpType::kOpTypeAllReduce - && conf.nccl_fusion_all_reduce_use_buffer() && request_ids.size() > 1) { - LaunchFusedAllReduce(comm_group, device_id2stream_ctx, request_store, request_ids); - } else { - LaunchAggregatedOps(comm_group, device_id2stream_ctx, request_store, request_ids); - } - AddCallbackAndResetRuntimeRequest(comm_group, device_id2stream_ctx, request_store, request_ids); - } - - CollectiveBoxingConf conf; - int64_t fusion_threshold; - int32_t num_streams; - int32_t current_stream_id; - bool enable_mixed_fusion; - std::vector op_type2fusion_enabled; - std::shared_ptr request_store; - HashMap> device_set2stream_id2comm_group; - std::vector>> stream_id2device_id2stream_ctx; -}; - -NcclExecutorBackend::NcclExecutorBackend() = default; - -NcclExecutorBackend::~NcclExecutorBackend() = default; - -void NcclExecutorBackend::Init(std::shared_ptr request_store) { - impl_ = std::make_unique( - Singleton::Get()->collective_boxing_conf(), request_store); -} - -void NcclExecutorBackend::InitJob(int64_t job_id) { - CudaCurrentDeviceGuard guard; - impl_->InitCommGroup(job_id); -} - 
-void NcclExecutorBackend::DeinitJob(int64_t job_id) {} - -void NcclExecutorBackend::GroupRequests( - const std::vector& request_ids, - const std::function&&, void*)>& Handler) { - impl_->GroupRequests(request_ids, Handler); -} - -void* NcclExecutorBackend::CreateGroupToken(const std::vector& group) { - return impl_->CreateGroupToken(group); -} - -void NcclExecutorBackend::DestroyGroupToken(void* group_token) { - return impl_->DestroyGroupToken(group_token); -} - -void NcclExecutorBackend::ExecuteGroup(void* group_token) { impl_->ExecuteGroup(group_token); } - -} // namespace collective - -} // namespace boxing - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/job/collective_boxing/nccl_executor_backend.h" +#include "oneflow/core/job/collective_boxing/request_store.h" +#include "oneflow/core/device/nccl_util.h" +#include "oneflow/core/graph/boxing/collective_boxing_util.h" +#include "oneflow/core/job/resource_desc.h" +#include "oneflow/core/control/ctrl_client.h" +#include "oneflow/core/control/global_process_ctx.h" +#include "oneflow/core/job/global_for.h" +#include "oneflow/core/thread/thread_pool.h" +#include "oneflow/core/device/cuda_util.h" + +#include + +#include +#include + +namespace oneflow { + +namespace boxing { + +namespace collective { + +namespace { + +ncclRedOp_t GetNcclReduceOp(ReduceMethod reduce_method) { + if (reduce_method == kReduceMethodSum) { + return ncclRedOp_t::ncclSum; + } else { + UNIMPLEMENTED(); + return ncclRedOp_t{}; + } +} + +std::string GetNcclUniqueIdRpcKey(const std::string& name, int64_t stream_id) { + return "CollectiveBoxingExecutorNcclUniqueIdRpcKey-" + name + "-" + std::to_string(stream_id); +} + +struct CopyParams { + void* dst; + const void* src; + int64_t count; +}; + +constexpr int64_t kMultiCopyParamsMaxSize = 128; +constexpr int64_t kMultiCopyAlignSize = 32; + +int64_t GetMultiCopyAlignedSize(int64_t size) { + return ((size + kMultiCopyAlignSize - 1) / kMultiCopyAlignSize) * kMultiCopyAlignSize; +} + +struct MultiCopyParams { + CopyParams params[kMultiCopyParamsMaxSize]; + int64_t count; + + MultiCopyParams() : count(0), params{} {} + + void Add(void* dst, const void* src, int64_t count) { + CHECK_LT(this->count, kMultiCopyParamsMaxSize); + params[this->count].dst = dst; + params[this->count].src = src; + params[this->count].count = count; + this->count += 1; + } +}; + +using BulkType = ulonglong2; + +__global__ void MultiCopyGpu(MultiCopyParams multi_params) { + for (int64_t p = 0; p < multi_params.count; ++p) { + const CopyParams params = multi_params.params[p]; + auto* bulk_dst = reinterpret_cast(params.dst); + const auto* bulk_src = reinterpret_cast(params.src); + const int64_t bulk_count = params.count / sizeof(BulkType); + CUDA_1D_KERNEL_LOOP_T(int64_t, i, bulk_count) { bulk_dst[i] = bulk_src[i]; } + const int64_t tail_offset = bulk_count * sizeof(BulkType); + auto* tail_dst = reinterpret_cast(params.dst) + tail_offset; + const auto* tail_src 
= reinterpret_cast(params.src) + tail_offset; + const int64_t tail_count = params.count - tail_offset; + CUDA_1D_KERNEL_LOOP_T(int64_t, i, tail_count) { tail_dst[i] = tail_src[i]; } + } +} + +void MultiCopy(hipStream_t stream, const MultiCopyParams& multi_params) { + if (multi_params.count <= 0) { return; } + CHECK_LE(multi_params.count, kMultiCopyParamsMaxSize); + int64_t max_count = multi_params.params[0].count; + for (int64_t i = 0; i < multi_params.count; ++i) { + max_count = std::max(max_count, multi_params.params[i].count); + } + hipLaunchKernelGGL(MultiCopyGpu, BlocksNum4ThreadsNum(max_count), kCudaThreadsNumPerBlock, 0, stream, + multi_params); +} + +class CommRank final { + public: + OF_DISALLOW_COPY(CommRank); + CommRank(int32_t device_id, int32_t global_rank, int32_t global_rank_count, int32_t local_rank, + int32_t local_rank_count) + : device_id_(device_id), + global_rank_(global_rank), + local_rank_(local_rank), + nccl_comm_(nullptr) {} + + CommRank(CommRank&& rhs) noexcept { + this->device_id_ = rhs.device_id_; + this->global_rank_ = rhs.global_rank_; + this->local_rank_ = rhs.local_rank_; + this->nccl_comm_ = rhs.nccl_comm_; + rhs.nccl_comm_ = nullptr; + } + + ~CommRank() { + if (nccl_comm_ != nullptr) { + CudaCurrentDeviceGuard guard(device_id_); + OF_NCCL_CHECK(ncclCommDestroy(nccl_comm_)); + } + } + + int32_t device_id() const { return device_id_; } + + ncclComm_t nccl_comm() const { return nccl_comm_; } + + void InitRank(ncclUniqueId unique_id, int32_t global_rank_count) { + CudaCurrentDeviceGuard guard(device_id_); + OF_NCCL_CHECK(ncclCommInitRank(&nccl_comm_, global_rank_count, unique_id, global_rank_)); + } + + private: + int32_t device_id_; + int32_t global_rank_; + int32_t local_rank_; + ncclComm_t nccl_comm_; +}; + +class CommGroup final { + public: + OF_DISALLOW_COPY(CommGroup); + CommGroup() = default; + ~CommGroup() = default; + CommGroup(CommGroup&& rhs) noexcept { + rank_vec_.swap(rhs.rank_vec_); + global_rank_count_ = rhs.global_rank_count_; + } + + void InitGroup(const DeviceSet& device_set, const std::string& unique_name) { + CudaCurrentDeviceGuard guard; + const int64_t this_machine_id = GlobalProcessCtx::Rank(); + global_rank_count_ = device_set.device_size(); + std::vector local_ranks; + for (int32_t i = 0; i < global_rank_count_; ++i) { + if (device_set.device(i).machine_id() == this_machine_id) { local_ranks.emplace_back(i); } + } + const int32_t local_rank_count = local_ranks.size(); + CHECK_GT(local_rank_count, 0); + ncclUniqueId nccl_unique_id{}; + if (local_ranks.front() == 0) { + OF_NCCL_CHECK(ncclGetUniqueId(&nccl_unique_id)); + if (local_rank_count != global_rank_count_) { + Singleton::Get()->PushKV(unique_name, NcclUniqueIdToString(nccl_unique_id)); + } + } else { + Singleton::Get()->PullKV(unique_name, [&nccl_unique_id](const std::string& val) { + NcclUniqueIdFromString(val, &nccl_unique_id); + }); + } + rank_vec_.reserve(local_rank_count); + OF_NCCL_CHECK(ncclGroupStart()); + for (int32_t local_rank = 0; local_rank < local_ranks.size(); ++local_rank) { + const int32_t global_rank = local_ranks.at(local_rank); + const int32_t device_id = device_set.device(global_rank).device_id(); + OF_CUDA_CHECK(hipSetDevice(device_id)); + rank_vec_.emplace_back(device_id, global_rank, global_rank_count_, local_rank, + local_rank_count); + rank_vec_.at(local_rank).InitRank(nccl_unique_id, global_rank_count_); + } + OF_NCCL_CHECK(ncclGroupEnd()); + } + + int32_t global_rank_count() const { return global_rank_count_; } + + int32_t local_rank_count() const { 
return rank_vec_.size(); } + + const CommRank& GetCommRank(int32_t local_rank) const { return rank_vec_.at(local_rank); } + + private: + std::vector rank_vec_; + int32_t global_rank_count_ = 0; +}; + +class StreamCtx { + public: + OF_DISALLOW_COPY(StreamCtx); + StreamCtx(int32_t device_id, size_t fusion_buffer_size) + : device_id_(device_id), fusion_buffer_size_(fusion_buffer_size) { + CudaCurrentDeviceGuard guard(device_id_); + int priority; + OF_CUDA_CHECK(hipDeviceGetStreamPriorityRange(nullptr, &priority)); + OF_CUDA_CHECK(hipStreamCreateWithPriority(&stream_, hipStreamNonBlocking, priority)); + OF_CUDA_CHECK(hipMalloc(&fusion_buffer_, fusion_buffer_size_)); + cb_event_poller_ = std::thread(&StreamCtx::PollEvent, this); + } + ~StreamCtx() { + cb_event_chan_.Close(); + cb_event_poller_.join(); + CudaCurrentDeviceGuard guard(device_id_); + OF_CUDA_CHECK(hipStreamSynchronize(stream_)); + OF_CUDA_CHECK(hipStreamDestroy(stream_)); + OF_CUDA_CHECK(hipFree(fusion_buffer_)); + } + + void PollEvent() { + CudaCurrentDeviceGuard guard(device_id_); + while (true) { + std::pair> cb_event; + ChannelStatus status = cb_event_chan_.Receive(&cb_event); + if (status == kChannelStatusErrorClosed) { break; } + CHECK_EQ(status, kChannelStatusSuccess); + OF_CUDA_CHECK(hipEventSynchronize(cb_event.first)); + cb_event.second(); + OF_CUDA_CHECK(hipEventDestroy(cb_event.first)); + } + } + + void AddCallback(const std::function& callback) { + hipEvent_t event; + OF_CUDA_CHECK(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + OF_CUDA_CHECK(hipEventRecord(event, stream_)); + CHECK_EQ(cb_event_chan_.Send(std::make_pair(event, callback)), kChannelStatusSuccess); + } + + int32_t device_id() const { return device_id_; } + + hipStream_t stream() const { return stream_; } + + size_t fusion_buffer_size() const { return fusion_buffer_size_; } + + char* fusion_buffer() const { return fusion_buffer_; } + + private: + int32_t device_id_; + hipStream_t stream_ = nullptr; + size_t fusion_buffer_size_; + char* fusion_buffer_ = nullptr; + Channel>> cb_event_chan_; + std::thread cb_event_poller_; +}; + +void LaunchFusedAllReduce(const CommGroup& comm_group, + const std::vector>& device_id2stream_ctx, + const std::shared_ptr& request_store, + const std::vector& request_ids) { + CHECK_LE(request_ids.size(), kMultiCopyParamsMaxSize); + RequestEntry* first_request_entry = request_store->MutRequestEntry(request_ids.front()); + const ncclDataType_t nccl_data_type = + GetNcclDataType(first_request_entry->desc().op_desc().data_type()); + const ncclRedOp_t nccl_reduce_op = + GetNcclReduceOp(first_request_entry->desc().op_desc().reduce_method()); + const int64_t size_of_data_type = + GetSizeOfDataType(first_request_entry->desc().op_desc().data_type()); + std::vector offset_vec; + offset_vec.reserve(request_ids.size()); + int64_t offset = 0; + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + offset_vec.emplace_back(offset); + offset += GetMultiCopyAlignedSize(request_entry->size_in_bytes()); + }); + const int64_t elem_cnt = offset / size_of_data_type; + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + MultiCopyParams copy_in_params; + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + CHECK_LE(offset, stream_ctx->fusion_buffer_size()); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, 
[&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + copy_in_params.Add(stream_ctx->fusion_buffer() + offset_vec.at(i), + request_entry->GetRuntimeRequest(local_rank)->send_buff, + request_entry->size_in_bytes()); + }); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + MultiCopy(stream_ctx->stream(), copy_in_params); + } + + OF_NCCL_CHECK(ncclGroupStart()); + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + OF_NCCL_CHECK(ncclAllReduce(stream_ctx->fusion_buffer(), stream_ctx->fusion_buffer(), elem_cnt, + nccl_data_type, nccl_reduce_op, comm_rank.nccl_comm(), + stream_ctx->stream())); + } + OF_NCCL_CHECK(ncclGroupEnd()); + + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + MultiCopyParams copy_out_params; + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + copy_out_params.Add(request_entry->GetRuntimeRequest(local_rank)->recv_buff, + stream_ctx->fusion_buffer() + offset_vec.at(i), + request_entry->size_in_bytes()); + }); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + MultiCopy(stream_ctx->stream(), copy_out_params); + } +} + +void LaunchAggregatedOps(const CommGroup& comm_group, + const std::vector>& device_id2stream_ctx, + const std::shared_ptr& request_store, + const std::vector& request_ids) { + OF_NCCL_CHECK(ncclGroupStart()); + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const auto comm = comm_rank.nccl_comm(); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const auto& op_desc = request_entry->desc().op_desc(); + const std::shared_ptr& runtime_request_info = + request_entry->GetRuntimeRequest(local_rank); + const OpType op_type = op_desc.op_type(); + const void* send_buff = runtime_request_info->send_buff; + void* recv_buff = runtime_request_info->recv_buff; + const int64_t elem_cnt = request_entry->elem_cnt(); + const ncclDataType_t nccl_data_type = GetNcclDataType(op_desc.data_type()); + const int32_t num_ranks = comm_group.global_rank_count(); + if (op_type == OpType::kOpTypeAllReduce) { + OF_NCCL_CHECK(ncclAllReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, + GetNcclReduceOp(op_desc.reduce_method()), comm, + stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeAllGather) { + CHECK_EQ(elem_cnt % num_ranks, 0); + OF_NCCL_CHECK(ncclAllGather(send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, + comm, stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeReduceScatter) { + CHECK_EQ(elem_cnt % num_ranks, 0); + OF_NCCL_CHECK(ncclReduceScatter( + send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, + GetNcclReduceOp(op_desc.reduce_method()), comm, stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeReduce) { + 
OF_NCCL_CHECK(ncclReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, + GetNcclReduceOp(op_desc.reduce_method()), op_desc.root(), comm, + stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeBroadcast) { + OF_NCCL_CHECK(ncclBroadcast(send_buff, recv_buff, elem_cnt, nccl_data_type, + op_desc.root(), comm, stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeAll2All) { +#if NCCL_VERSION_CODE > 2700 + const int64_t elem_per_rank = elem_cnt / num_ranks; + const int64_t elem_per_chunk = elem_per_rank / num_ranks; + const int64_t dtype_size = GetSizeOfDataType(op_desc.data_type()); + const int64_t chunk_size = elem_per_chunk * dtype_size; + for (int64_t j = 0; j < num_ranks; ++j) { + OF_NCCL_CHECK(ncclSend(reinterpret_cast( + reinterpret_cast(send_buff) + j * chunk_size), + elem_per_chunk, nccl_data_type, j, comm, + stream_ctx->stream())); + OF_NCCL_CHECK(ncclRecv( + reinterpret_cast(reinterpret_cast(recv_buff) + j * chunk_size), + elem_per_chunk, nccl_data_type, j, comm, stream_ctx->stream())); + } +#else + UNIMPLEMENTED(); +#endif + } else { + UNIMPLEMENTED(); + } + }); + } + OF_NCCL_CHECK(ncclGroupEnd()); +} + +void AddCallbackAndResetRuntimeRequest( + const CommGroup& comm_group, + const std::vector>& device_id2stream_ctx, + const std::shared_ptr& request_store, const std::vector& request_ids) { + std::vector>> saved_runtime_request_info( + request_ids.size()); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + saved_runtime_request_info.at(i) = std::move(request_entry->ResetRuntimeRequest()); + }); + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + auto runtime_request_info_vec = + std::make_shared>>(); + runtime_request_info_vec->reserve(request_ids.size()); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + runtime_request_info_vec->emplace_back( + std::move(saved_runtime_request_info.at(i).at(local_rank))); + }); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + stream_ctx->AddCallback([runtime_request_info_vec]() { + for (auto& runtime_request_info : *runtime_request_info_vec) { + runtime_request_info->callback(Maybe::Ok()); + } + }); + } +} + +} // namespace + +struct NcclExecutorBackend::Impl { + Impl(const CollectiveBoxingConf& conf, std::shared_ptr request_store) + : conf(conf), request_store(std::move(request_store)) { + CHECK_GT(conf.nccl_num_streams(), 0); + CHECK_GE(conf.nccl_fusion_threshold_mb(), 0); + fusion_threshold = conf.nccl_fusion_threshold_mb() * 1024 * 1024; + num_streams = conf.nccl_num_streams(); + current_stream_id = 0; + enable_mixed_fusion = + (!conf.nccl_fusion_all_reduce_use_buffer()) && conf.nccl_enable_mixed_fusion(); + int nccl_version; + OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); + if (nccl_version == 21003) { + LOG(WARNING) + << "Current nccl version is 2.10.3, in this version, ncclGroup() with mixed " + "datatype/element/collective could induce crash or corruption, so we will not " + "fuse any request."; + } + InitStreamCtx(); + InitIsOpTypeFusionEnabled(); + } + ~Impl() { + stream_id2device_id2stream_ctx.clear(); + device_set2stream_id2comm_group.clear(); + } + + void InitCommGroup(int64_t job_id) { + std::set local_device_ids; + 
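+    // Editor's note (descriptive comment, not part of the original patch): the traversal below
+    // visits every NCCL-backed request of this job, builds one CommGroup per (DeviceSet,
+    // stream_id) the first time a DeviceSet is seen, and collects the local device ids that
+    // participate, so that a StreamCtx (HIP stream + fusion buffer + callback poller thread)
+    // can then be created lazily for each (stream_id, device_id) pair.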
request_store->ForEachMutRequestEntryInJob( + job_id, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const auto& request = request_entry->desc(); + if (request.op_desc().backend() != Backend::kBackendNCCL) { return; } + if (!request_entry->HasRankOnThisNode()) { return; } + const DeviceSet& device_set = request.device_set(); + if (device_set2stream_id2comm_group.count(device_set) > 0) { return; } + auto& stream_id2comm_group = device_set2stream_id2comm_group[device_set]; + stream_id2comm_group.resize(num_streams); + for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { + stream_id2comm_group.at(stream_id).InitGroup( + device_set, GetNcclUniqueIdRpcKey(request.op_desc().name(), stream_id)); + } + for (int32_t j = 0; j < stream_id2comm_group.at(0).local_rank_count(); ++j) { + local_device_ids.emplace(stream_id2comm_group.at(0).GetCommRank(j).device_id()); + } + }); + for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { + for (const int64_t device_id : local_device_ids) { + if (stream_id2device_id2stream_ctx.at(stream_id).at(device_id) == nullptr) { + stream_id2device_id2stream_ctx.at(stream_id).at(device_id) = + std::make_unique(device_id, fusion_threshold); + } + } + } + } + + void InitStreamCtx() { + int32_t num_devices; + OF_CUDA_CHECK(hipGetDeviceCount(&num_devices)); + stream_id2device_id2stream_ctx.resize(num_streams); + for (int64_t stream_id = 0; stream_id < num_streams; ++stream_id) { + stream_id2device_id2stream_ctx.at(stream_id).resize(num_devices); + } + } + + void InitIsOpTypeFusionEnabled() { + op_type2fusion_enabled.resize(OpType_ARRAYSIZE, false); + op_type2fusion_enabled.at(OpType::kOpTypeAllReduce) = conf.nccl_fusion_all_reduce(); + op_type2fusion_enabled.at(OpType::kOpTypeAllGather) = conf.nccl_fusion_all_gather(); + op_type2fusion_enabled.at(OpType::kOpTypeReduceScatter) = conf.nccl_fusion_reduce_scatter(); + op_type2fusion_enabled.at(OpType::kOpTypeReduce) = conf.nccl_fusion_reduce(); + op_type2fusion_enabled.at(OpType::kOpTypeBroadcast) = conf.nccl_fusion_broadcast(); + op_type2fusion_enabled.at(OpType::kOpTypeAll2All) = false; + } + + int32_t NextStreamId() { + const int32_t stream_id = current_stream_id; + current_stream_id = (current_stream_id + 1) % num_streams; + return stream_id; + } + + bool IsOpTypeFusionEnabled(OpType op_type) const { return op_type2fusion_enabled.at(op_type); } + + bool IsRequestEntryFusionEnabled(const RequestEntry* entry) const { + return IsOpTypeFusionEnabled(entry->desc().op_desc().op_type()); + } + + bool CanRequestEntryFuse(const RequestEntry* lhs, const RequestEntry* rhs) const { + { + int nccl_version; + OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); + // Workaround for https://github.com/NVIDIA/nccl/issues/560 + if (nccl_version == 21003) { return false; } + } + if (lhs->device_set_symbol() != rhs->device_set_symbol()) { return false; } + if ((!IsRequestEntryFusionEnabled(lhs)) || (!IsRequestEntryFusionEnabled(rhs))) { + return false; + } + if ((!enable_mixed_fusion) + && lhs->desc().op_desc().op_type() != rhs->desc().op_desc().op_type()) { + return false; + } + if (conf.nccl_fusion_all_reduce_use_buffer()) { + if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce + && rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { + CHECK(lhs->desc().op_desc().has_reduce_method()); + CHECK(rhs->desc().op_desc().has_reduce_method()); + return lhs->desc().op_desc().reduce_method() == rhs->desc().op_desc().reduce_method() + && lhs->desc().op_desc().data_type() == 
rhs->desc().op_desc().data_type(); + } else if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce + || rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { + return false; + } else { + return true; + } + } else { + return true; + } + } + + void GroupRequests(const std::vector& request_ids, + const std::function&&, void*)>& Handler) { + std::vector group; + int64_t group_size = 0; + const int64_t fusion_max_ops = std::min(conf.nccl_fusion_max_ops(), kMultiCopyParamsMaxSize); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const auto& request = request_entry->desc(); + const int64_t size = GetMultiCopyAlignedSize(request_entry->size_in_bytes()); + if (group.empty() + || !CanRequestEntryFuse(request_store->MutRequestEntry(group.back()), request_entry) + || group_size + size > fusion_threshold || group.size() >= fusion_max_ops) { + if (!group.empty()) { + void* token = CreateGroupToken(group); + Handler(std::move(group), token); + group.clear(); + group_size = 0; + } + } + group.emplace_back(request_id); + group_size += size; + }); + if (!group.empty()) { + void* token = CreateGroupToken(group); + Handler(std::move(group), token); + } + } + + struct GroupToken { + GroupToken(const std::vector& group, std::vector* stream_id2comm_group) + : request_ids(group), stream_id2comm_group(stream_id2comm_group) {} + std::vector request_ids; + std::vector* stream_id2comm_group; + }; + + void* CreateGroupToken(const std::vector& group) { + CHECK_GT(group.size(), 0); + void* group_token; + const DeviceSet& first_device_set = + request_store->MutRequestEntry(group.front())->desc().device_set(); + auto it = device_set2stream_id2comm_group.find(first_device_set); + CHECK(it != device_set2stream_id2comm_group.end()); + group_token = new GroupToken(group, &it->second); + request_store->ForEachMutRequestEntryForIdsInJob( + group, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const DeviceSet& device_set = request_entry->desc().device_set(); + CHECK(first_device_set == device_set); + }); + return group_token; + } + + void DestroyGroupToken(void* group_token) { + GroupToken* token = static_cast(group_token); + delete token; + } + + void ExecuteGroup(void* group_token) { + GroupToken* token = static_cast(group_token); + const std::vector& request_ids = token->request_ids; + if (request_ids.empty()) { return; } + const int32_t stream_id = NextStreamId(); + CudaCurrentDeviceGuard device_guard; + const auto& comm_group = token->stream_id2comm_group->at(stream_id); + auto& device_id2stream_ctx = stream_id2device_id2stream_ctx.at(stream_id); + if (request_store->MutRequestEntry(request_ids.front())->desc().op_desc().op_type() + == OpType::kOpTypeAllReduce + && conf.nccl_fusion_all_reduce_use_buffer() && request_ids.size() > 1) { + LaunchFusedAllReduce(comm_group, device_id2stream_ctx, request_store, request_ids); + } else { + LaunchAggregatedOps(comm_group, device_id2stream_ctx, request_store, request_ids); + } + AddCallbackAndResetRuntimeRequest(comm_group, device_id2stream_ctx, request_store, request_ids); + } + + CollectiveBoxingConf conf; + int64_t fusion_threshold; + int32_t num_streams; + int32_t current_stream_id; + bool enable_mixed_fusion; + std::vector op_type2fusion_enabled; + std::shared_ptr request_store; + HashMap> device_set2stream_id2comm_group; + std::vector>> stream_id2device_id2stream_ctx; +}; + +NcclExecutorBackend::NcclExecutorBackend() = default; + 
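+// Editor's note (hedged summary, not part of the original patch): the public NcclExecutorBackend
+// methods below forward to Impl. The lifecycle, as implemented above, is roughly:
+//   Init(request_store);          // build Impl from the CollectiveBoxingConf
+//   InitJob(job_id);              // create CommGroups and StreamCtx for the job's device sets
+//   GroupRequests(ids, handler);  // fuse compatible requests; handler gets the ids plus a GroupToken
+//   ExecuteGroup(token);          // run the fused-buffer all-reduce or the aggregated NCCL ops,
+//                                 // then register completion callbacks on each stream
+//   DestroyGroupToken(token);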
+NcclExecutorBackend::~NcclExecutorBackend() = default; + +void NcclExecutorBackend::Init(std::shared_ptr request_store) { + impl_ = std::make_unique( + Singleton::Get()->collective_boxing_conf(), request_store); +} + +void NcclExecutorBackend::InitJob(int64_t job_id) { + CudaCurrentDeviceGuard guard; + impl_->InitCommGroup(job_id); +} + +void NcclExecutorBackend::DeinitJob(int64_t job_id) {} + +void NcclExecutorBackend::GroupRequests( + const std::vector& request_ids, + const std::function&&, void*)>& Handler) { + impl_->GroupRequests(request_ids, Handler); +} + +void* NcclExecutorBackend::CreateGroupToken(const std::vector& group) { + return impl_->CreateGroupToken(group); +} + +void NcclExecutorBackend::DestroyGroupToken(void* group_token) { + return impl_->DestroyGroupToken(group_token); +} + +void NcclExecutorBackend::ExecuteGroup(void* group_token) { impl_->ExecuteGroup(group_token); } + +} // namespace collective + +} // namespace boxing + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp index b2ab4cb..36d981a 100644 --- a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp +++ b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/job/nd_sbp_util.h" -#if defined(WITH_CUDA) || defined(ROCM) +#if defined(WITH_CUDA) || defined(WITH_ROCM) #include "oneflow/core/framework/framework.h" #include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/instructions_builder.h" diff --git a/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp b/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp index 961e203..6a524fa 100644 --- a/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp +++ b/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp @@ -1,80 +1,80 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "oneflow/core/job_rewriter/job_pass.h" -#include "oneflow/core/framework/framework.h" - -namespace oneflow { - -namespace { - -class SequentialOneEmbeddingOpsPass final : public JobPass { - public: - SequentialOneEmbeddingOpsPass() = default; - ~SequentialOneEmbeddingOpsPass() override = default; - - bool IsEnabled(const JobPassCtx& ctx) const { - return ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_PIPELINED_EXECUTION", false); - } - Maybe Apply(const OpGraph& op_graph, JobBuilder* job_builder) const; - - Maybe Apply(Job* job, JobPassCtx* ctx) const override { - if (!IsEnabled(*ctx)) { return Maybe::Ok(); } - const OpGraph op_graph(*job); - JobBuilder job_builder(job); - return Apply(op_graph, &job_builder); - } -}; - -bool IsUserOpWithTypeName(const OperatorConf& op_conf, const std::string& op_type_name) { - return op_conf.has_user_conf() && op_conf.user_conf().op_type_name() == op_type_name; -}; - -Maybe SequentialOneEmbeddingOpsPass::Apply(const OpGraph& op_graph, - JobBuilder* job_builder) const { - HashMap> stream_name_hint2shuffle_op_names; - op_graph.TopoForEachNode([&](const OpNode* op_node) { - if (!(IsUserOpWithTypeName(op_node->op().op_conf(), "id_shuffle") - || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_shuffle") - || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_gradient_shuffle"))) { - return; - } - OperatorConf op_conf = op_node->op().op_conf(); - std::string stream_name; - if (op_conf.has_stream_name_hint()) { - stream_name = op_conf.stream_name_hint(); - } else { - stream_name = "DEFAULT"; - } - const auto& it = stream_name_hint2shuffle_op_names.find(stream_name); - if (it != stream_name_hint2shuffle_op_names.end()) { - if (it->second.size() > 0) { - std::string pre_shuffle_op_name = it->second.back(); - op_conf.add_ctrl_in_op_name(pre_shuffle_op_name); - job_builder->MutOpsOnlyOnce({op_conf}); - } - it->second.push_back(op_conf.name()); - } else { - std::vector shuffle_ops{op_conf.name()}; - CHECK(stream_name_hint2shuffle_op_names.emplace(stream_name, shuffle_ops).second); - } - }); - - return Maybe::Ok(); -} - -} // namespace - -REGISTER_JOB_PASS("SequentialOneEmbeddingOpsPass", SequentialOneEmbeddingOpsPass); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/core/job_rewriter/job_pass.h" +#include "oneflow/core/framework/framework.h" + +namespace oneflow { + +namespace { + +class SequentialOneEmbeddingOpsPass final : public JobPass { + public: + SequentialOneEmbeddingOpsPass() = default; + ~SequentialOneEmbeddingOpsPass() override = default; + + bool IsEnabled(const JobPassCtx& ctx) const { + return ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_PIPELINED_EXECUTION", false); + } + Maybe Apply(const OpGraph& op_graph, JobBuilder* job_builder) const; + + Maybe Apply(Job* job, JobPassCtx* ctx) const override { + if (!IsEnabled(*ctx)) { return Maybe::Ok(); } + const OpGraph op_graph(*job); + JobBuilder job_builder(job); + return Apply(op_graph, &job_builder); + } +}; + +bool IsUserOpWithTypeName(const OperatorConf& op_conf, const std::string& op_type_name) { + return op_conf.has_user_conf() && op_conf.user_conf().op_type_name() == op_type_name; +}; + +Maybe SequentialOneEmbeddingOpsPass::Apply(const OpGraph& op_graph, + JobBuilder* job_builder) const { + HashMap> stream_name_hint2shuffle_op_names; + op_graph.TopoForEachNode([&](const OpNode* op_node) { + if (!(IsUserOpWithTypeName(op_node->op().op_conf(), "id_shuffle") + || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_shuffle") + || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_gradient_shuffle"))) { + return; + } + OperatorConf op_conf = op_node->op().op_conf(); + std::string stream_name; + if (op_conf.has_stream_name_hint()) { + stream_name = op_conf.stream_name_hint(); + } else { + stream_name = "DEFAULT"; + } + const auto& it = stream_name_hint2shuffle_op_names.find(stream_name); + if (it != stream_name_hint2shuffle_op_names.end()) { + if (it->second.size() > 0) { + std::string pre_shuffle_op_name = it->second.back(); + op_conf.add_ctrl_in_op_name(pre_shuffle_op_name); + job_builder->MutOpsOnlyOnce({op_conf}); + } + it->second.push_back(op_conf.name()); + } else { + std::vector shuffle_ops{op_conf.name()}; + CHECK(stream_name_hint2shuffle_op_names.emplace(stream_name, shuffle_ops).second); + } + }); + + return Maybe::Ok(); +} + +} // namespace + +REGISTER_JOB_PASS("SequentialOneEmbeddingOpsPass", SequentialOneEmbeddingOpsPass); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp b/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp index b0c81a6..2c38465 100644 --- a/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp +++ b/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp @@ -1,133 +1,133 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/kernel/cuda_check_numerics_kernel_observer.h" -#include "oneflow/core/kernel/kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ bool IsNotFinite(T x) { - return !isfinite(x); -} - -template<> -__device__ bool IsNotFinite(half x) { - return (__hisinf(x) || __hisnan(x)); -} - -template -__global__ void HasNotFiniteGpuKernel(const int64_t n, const T* x, volatile bool* has_not_finite) { - if (*has_not_finite) { return; } - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { - if (IsNotFinite(x[i])) { - *has_not_finite = true; - return; - } - } -} - -template -bool HasNotFinite(ep::Stream* stream, const int64_t elem_cnt, const T* data_ptr, - bool* has_not_finite_host, bool* has_not_finite_device) { - OF_CUDA_CHECK(hipMemsetAsync(has_not_finite_device, 0, sizeof(bool), - stream->As()->cuda_stream())); - HasNotFiniteGpuKernel - <<As()->cuda_stream()>>>(elem_cnt, data_ptr, has_not_finite_device); - OF_CUDA_CHECK(hipMemcpyAsync(has_not_finite_host, has_not_finite_device, sizeof(bool), - hipMemcpyDefault, stream->As()->cuda_stream())); - OF_CUDA_CHECK(hipStreamSynchronize(stream->As()->cuda_stream())); - return *has_not_finite_host; -} - -bool HasNotFiniteGpu(ep::Stream* stream, const Blob* blob, bool* has_not_finite_host, - bool* has_not_finite_device) { - auto* cuda_stream = stream->As(); - const DataType dtype = blob->data_type(); - const int64_t elem_cnt = blob->shape().elem_cnt(); - if (elem_cnt == 0) { return false; } - if (dtype == kFloat) { - return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, - has_not_finite_device); - } else if (dtype == kDouble) { - return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, - has_not_finite_device); - } else if (dtype == kFloat16) { - if (cuda_stream->cuda_arch() >= 530) { - return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, - has_not_finite_device); - } else { - LOG(FATAL) << "use half need nvcc arch >= 530"; - return true; - } - } else { - return false; - } -} - -void DumpBlob(KernelContext* ctx, const std::string& bn) { - Blob* blob = ctx->BnInOp2Blob(bn); - if (blob != nullptr) { - std::vector buffer(blob->ByteSizeOfBlobBody()); - OF_CUDA_CHECK( - hipMemcpy(buffer.data(), blob->dptr(), blob->ByteSizeOfBlobBody(), hipMemcpyDefault)); - OF_CUDA_CHECK(hipDeviceSynchronize()); - std::ofstream ofs(bn); - ofs.write(buffer.data(), blob->ByteSizeOfBlobBody()); - } -} - -void DumpBlobs(KernelContext* ctx, const Kernel* kernel) { - for (const auto& obn : kernel->op_attribute().output_bns()) { DumpBlob(ctx, obn); } - for (const auto& ibn : kernel->op_attribute().input_bns()) { DumpBlob(ctx, ibn); } -} - -} // namespace - -CudaCheckNumericsKernelObserver::CudaCheckNumericsKernelObserver() - : has_not_finite_host_(nullptr), has_not_finite_device_(nullptr) { - OF_CUDA_CHECK(hipGetDevice(&device_id_)); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&has_not_finite_host_), sizeof(bool))); - OF_CUDA_CHECK(hipMalloc(&has_not_finite_device_, sizeof(bool))); -} - -CudaCheckNumericsKernelObserver::~CudaCheckNumericsKernelObserver() { - CudaCurrentDeviceGuard guard(device_id_); - OF_CUDA_CHECK(hipHostFree(has_not_finite_host_)); - OF_CUDA_CHECK(hipFree(has_not_finite_device_)); -} - -void CudaCheckNumericsKernelObserver::DidForwardDataContent(KernelContext* ctx, - const Kernel* kernel) { - for (const auto& obn : kernel->op_attribute().output_bns()) { - Blob* blob = ctx->BnInOp2Blob(obn); - if (blob != 
nullptr) { - bool has_not_finite = - HasNotFiniteGpu(ctx->stream(), blob, has_not_finite_host_, has_not_finite_device_); - if (has_not_finite - && ParseBooleanFromEnv("ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS_DUMP", false)) { - DumpBlobs(ctx, kernel); - } - CHECK(!has_not_finite) << kernel->op_conf().name() << " : " << obn << " has nan or inf"; - } - } -} - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/kernel/cuda_check_numerics_kernel_observer.h" +#include "oneflow/core/kernel/kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ bool IsNotFinite(T x) { + return !isfinite(x); +} + +template<> +__device__ bool IsNotFinite(half x) { + return (__hisinf(x) || __hisnan(x)); +} + +template +__global__ void HasNotFiniteGpuKernel(const int64_t n, const T* x, volatile bool* has_not_finite) { + if (*has_not_finite) { return; } + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { + if (IsNotFinite(x[i])) { + *has_not_finite = true; + return; + } + } +} + +template +bool HasNotFinite(ep::Stream* stream, const int64_t elem_cnt, const T* data_ptr, + bool* has_not_finite_host, bool* has_not_finite_device) { + OF_CUDA_CHECK(hipMemsetAsync(has_not_finite_device, 0, sizeof(bool), + stream->As()->cuda_stream())); + HasNotFiniteGpuKernel + <<As()->cuda_stream()>>>(elem_cnt, data_ptr, has_not_finite_device); + OF_CUDA_CHECK(hipMemcpyAsync(has_not_finite_host, has_not_finite_device, sizeof(bool), + hipMemcpyDefault, stream->As()->cuda_stream())); + OF_CUDA_CHECK(hipStreamSynchronize(stream->As()->cuda_stream())); + return *has_not_finite_host; +} + +bool HasNotFiniteGpu(ep::Stream* stream, const Blob* blob, bool* has_not_finite_host, + bool* has_not_finite_device) { + auto* cuda_stream = stream->As(); + const DataType dtype = blob->data_type(); + const int64_t elem_cnt = blob->shape().elem_cnt(); + if (elem_cnt == 0) { return false; } + if (dtype == kFloat) { + return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, + has_not_finite_device); + } else if (dtype == kDouble) { + return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, + has_not_finite_device); + } else if (dtype == kFloat16) { + if (cuda_stream->cuda_arch() >= 530) { + return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, + has_not_finite_device); + } else { + LOG(FATAL) << "use half need nvcc arch >= 530"; + return true; + } + } else { + return false; + } +} + +void DumpBlob(KernelContext* ctx, const std::string& bn) { + Blob* blob = ctx->BnInOp2Blob(bn); + if (blob != nullptr) { + std::vector buffer(blob->ByteSizeOfBlobBody()); + OF_CUDA_CHECK( + hipMemcpy(buffer.data(), blob->dptr(), blob->ByteSizeOfBlobBody(), hipMemcpyDefault)); + OF_CUDA_CHECK(hipDeviceSynchronize()); + std::ofstream ofs(bn); + ofs.write(buffer.data(), blob->ByteSizeOfBlobBody()); + } +} + +void DumpBlobs(KernelContext* ctx, const Kernel* kernel) { + for (const auto& obn : 
kernel->op_attribute().output_bns()) { DumpBlob(ctx, obn); } + for (const auto& ibn : kernel->op_attribute().input_bns()) { DumpBlob(ctx, ibn); } +} + +} // namespace + +CudaCheckNumericsKernelObserver::CudaCheckNumericsKernelObserver() + : has_not_finite_host_(nullptr), has_not_finite_device_(nullptr) { + OF_CUDA_CHECK(hipGetDevice(&device_id_)); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&has_not_finite_host_), sizeof(bool))); + OF_CUDA_CHECK(hipMalloc(&has_not_finite_device_, sizeof(bool))); +} + +CudaCheckNumericsKernelObserver::~CudaCheckNumericsKernelObserver() { + CudaCurrentDeviceGuard guard(device_id_); + OF_CUDA_CHECK(hipHostFree(has_not_finite_host_)); + OF_CUDA_CHECK(hipFree(has_not_finite_device_)); +} + +void CudaCheckNumericsKernelObserver::DidForwardDataContent(KernelContext* ctx, + const Kernel* kernel) { + for (const auto& obn : kernel->op_attribute().output_bns()) { + Blob* blob = ctx->BnInOp2Blob(obn); + if (blob != nullptr) { + bool has_not_finite = + HasNotFiniteGpu(ctx->stream(), blob, has_not_finite_host_, has_not_finite_device_); + if (has_not_finite + && ParseBooleanFromEnv("ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS_DUMP", false)) { + DumpBlobs(ctx, kernel); + } + CHECK(!has_not_finite) << kernel->op_conf().name() << " : " << obn << " has nan or inf"; + } + } +} + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/kernel_util.hip.h b/oneflow/core/kernel/kernel_util.hip.h index fc466e8..15a01bc 100644 --- a/oneflow/core/kernel/kernel_util.hip.h +++ b/oneflow/core/kernel/kernel_util.hip.h @@ -1,53 +1,53 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ -#define ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ -#include "oneflow/core/device/cuda_pseudo_half.h" -#include "oneflow/core/common/data_type.h" - -namespace oneflow { - -template::value>::type* = nullptr> -OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { - const T threshold = 1e-20; - return x > threshold ? x : threshold; -} - -template::value>::type* = nullptr> -OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { - return x; -} - -#if defined(__CUDACC__) || defined(__HIPCC__) -__device__ __forceinline__ half MaxWithLogThreshold(half x) { - half threshold = hexp2(__float2half(-14.0)); - if (__hgt(x, threshold)) { return x; } - return threshold; -} -#endif - -template -OF_DEVICE_FUNC T SafeLog(T x) { - return logf(MaxWithLogThreshold(x)); -} - -#if defined(__CUDACC__) || defined(__HIPCC__) -__device__ __forceinline__ half SafeLog(half x) { return hlog(MaxWithLogThreshold(x)); } -#endif - -} // namespace oneflow - -#endif // ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ +#define ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ +#include "oneflow/core/device/cuda_pseudo_half.h" +#include "oneflow/core/common/data_type.h" + +namespace oneflow { + +template::value>::type* = nullptr> +OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { + const T threshold = 1e-20; + return x > threshold ? x : threshold; +} + +template::value>::type* = nullptr> +OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { + return x; +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +__device__ __forceinline__ half MaxWithLogThreshold(half x) { + half threshold = hexp2(__float2half(-14.0)); + if (__hgt(x, threshold)) { return x; } + return threshold; +} +#endif + +template +OF_DEVICE_FUNC T SafeLog(T x) { + return logf(MaxWithLogThreshold(x)); +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +__device__ __forceinline__ half SafeLog(half x) { return hlog(MaxWithLogThreshold(x)); } +#endif + +} // namespace oneflow + +#endif // ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ diff --git a/oneflow/core/kernel/random_generator.hip.cpp b/oneflow/core/kernel/random_generator.hip.cpp index e35a1c6..db8dbd9 100644 --- a/oneflow/core/kernel/random_generator.hip.cpp +++ b/oneflow/core/kernel/random_generator.hip.cpp @@ -1,59 +1,59 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/kernel/random_generator.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - - -namespace oneflow { - -namespace { - -template -void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, T* ret); - -template<> -void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, float* ret) { - OF_CURAND_CHECK(hiprandGenerateUniform(gen, ret, n)); -} - -template<> -void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, double* ret) { - OF_CURAND_CHECK(hiprandGenerateUniformDouble(gen, ret, n)); -} - -} // namespace - -RandomGenerator::RandomGenerator(int64_t seed, ep::Stream* stream) { - OF_CURAND_CHECK(hiprandCreateGenerator(&curand_generator_, HIPRAND_RNG_PSEUDO_DEFAULT)); - OF_CURAND_CHECK(hiprandSetPseudoRandomGeneratorSeed(curand_generator_, seed)); - OF_CURAND_CHECK(hiprandSetStream(curand_generator_, stream->As()->cuda_stream())); -} - -RandomGenerator::~RandomGenerator() { - OF_CURAND_CHECK(hiprandDestroyGenerator(curand_generator_)); -} - -template -void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr) { - RngUniformGpu(curand_generator_, elem_cnt, dptr); -} - -#define INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM(T, typeproto) \ - template void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr); - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM, FLOATING_DATA_TYPE_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/kernel/random_generator.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + + +namespace oneflow { + +namespace { + +template +void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, T* ret); + +template<> +void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, float* ret) { + OF_CURAND_CHECK(hiprandGenerateUniform(gen, ret, n)); +} + +template<> +void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, double* ret) { + OF_CURAND_CHECK(hiprandGenerateUniformDouble(gen, ret, n)); +} + +} // namespace + +RandomGenerator::RandomGenerator(int64_t seed, ep::Stream* stream) { + OF_CURAND_CHECK(hiprandCreateGenerator(&curand_generator_, HIPRAND_RNG_PSEUDO_DEFAULT)); + OF_CURAND_CHECK(hiprandSetPseudoRandomGeneratorSeed(curand_generator_, seed)); + OF_CURAND_CHECK(hiprandSetStream(curand_generator_, stream->As()->cuda_stream())); +} + +RandomGenerator::~RandomGenerator() { + OF_CURAND_CHECK(hiprandDestroyGenerator(curand_generator_)); +} + +template +void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr) { + RngUniformGpu(curand_generator_, elem_cnt, dptr); +} + +#define INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM(T, typeproto) \ + template void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr); + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM, FLOATING_DATA_TYPE_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/util/numeric_limits.hip.h b/oneflow/core/kernel/util/numeric_limits.hip.h index 96a9b10..7cdc409 100644 --- a/oneflow/core/kernel/util/numeric_limits.hip.h +++ b/oneflow/core/kernel/util/numeric_limits.hip.h @@ -1,128 +1,128 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/NumericLimits.cuh -#pragma once -#include -#include -#include - -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" - -// numeric_limits.cuh is a holder for numeric limits definitions of commonly used -// types. This header is very specific to ROCm HIP and may be removed in the future. - -// The lower_bound and upper_bound constants are same as lowest and max for -// integral types, but are -inf and +inf for floating point types. They are -// useful in implementing min, max, etc. - -namespace oneflow { -namespace detail { - -#if defined(__HIPCC__) -#define OF_NUMERICS_FUNC static inline __host__ __device__ -#else -#define OF_NUMERICS_FUNC static inline -#endif - -template -struct numeric_limits {}; - -// WARNING: the following oneflow::numeric_limits definitions are there only to support -// HIP compilation for the moment. Use std::numeric_limits if you are not -// compiling for ROCm. -// from @colesbury: "The functions on numeric_limits aren't marked with -// __device__ which is why they don't work with ROCm. CUDA allows them -// because they're constexpr." - -namespace { -// ROCm doesn't like INFINITY too. 
-constexpr double inf = INFINITY; -} // namespace - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC bool lowest() { return false; } - OF_NUMERICS_FUNC bool max() { return true; } - OF_NUMERICS_FUNC bool lower_bound() { return false; } - OF_NUMERICS_FUNC bool upper_bound() { return true; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC uint8_t lowest() { return 0; } - OF_NUMERICS_FUNC uint8_t max() { return UINT8_MAX; } - OF_NUMERICS_FUNC uint8_t lower_bound() { return 0; } - OF_NUMERICS_FUNC uint8_t upper_bound() { return UINT8_MAX; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC int8_t lowest() { return INT8_MIN; } - OF_NUMERICS_FUNC int8_t max() { return INT8_MAX; } - OF_NUMERICS_FUNC int8_t lower_bound() { return INT8_MIN; } - OF_NUMERICS_FUNC int8_t upper_bound() { return INT8_MAX; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC int16_t lowest() { return INT16_MIN; } - OF_NUMERICS_FUNC int16_t max() { return INT16_MAX; } - OF_NUMERICS_FUNC int16_t lower_bound() { return INT16_MIN; } - OF_NUMERICS_FUNC int16_t upper_bound() { return INT16_MAX; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC int32_t lowest() { return INT32_MIN; } - OF_NUMERICS_FUNC int32_t max() { return INT32_MAX; } - OF_NUMERICS_FUNC int32_t lower_bound() { return INT32_MIN; } - OF_NUMERICS_FUNC int32_t upper_bound() { return INT32_MAX; } -}; - -template<> -struct numeric_limits { -#ifdef _MSC_VER - OF_NUMERICS_FUNC int64_t lowest() { return _I64_MIN; } - OF_NUMERICS_FUNC int64_t max() { return _I64_MAX; } - OF_NUMERICS_FUNC int64_t lower_bound() { return _I64_MIN; } - OF_NUMERICS_FUNC int64_t upper_bound() { return _I64_MAX; } -#else - OF_NUMERICS_FUNC int64_t lowest() { return INT64_MIN; } - OF_NUMERICS_FUNC int64_t max() { return INT64_MAX; } - OF_NUMERICS_FUNC int64_t lower_bound() { return INT64_MIN; } - OF_NUMERICS_FUNC int64_t upper_bound() { return INT64_MAX; } -#endif -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC float lowest() { return -FLT_MAX; } - OF_NUMERICS_FUNC float max() { return FLT_MAX; } - OF_NUMERICS_FUNC float lower_bound() { return -static_cast(inf); } - OF_NUMERICS_FUNC float upper_bound() { return static_cast(inf); } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC double lowest() { return -DBL_MAX; } - OF_NUMERICS_FUNC double max() { return DBL_MAX; } - OF_NUMERICS_FUNC double lower_bound() { return -inf; } - OF_NUMERICS_FUNC double upper_bound() { return inf; } -}; - -} // namespace detail +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/NumericLimits.cuh +#pragma once +#include +#include +#include + +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" + +// numeric_limits.cuh is a holder for numeric limits definitions of commonly used +// types. This header is very specific to ROCm HIP and may be removed in the future. 
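+// Editor's note (usage sketch, not part of the original patch; the identifiers mirror the
+// definitions in this header): device code can query type-dependent bounds without relying on
+// std::numeric_limits, whose member functions are not __device__-qualified, e.g.
+//   float lo = oneflow::detail::numeric_limits<float>::lower_bound();      // -inf
+//   int32_t hi = oneflow::detail::numeric_limits<int32_t>::upper_bound();  // INT32_MAX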
+ +// The lower_bound and upper_bound constants are same as lowest and max for +// integral types, but are -inf and +inf for floating point types. They are +// useful in implementing min, max, etc. + +namespace oneflow { +namespace detail { + +#if defined(__HIPCC__) +#define OF_NUMERICS_FUNC static inline __host__ __device__ +#else +#define OF_NUMERICS_FUNC static inline +#endif + +template +struct numeric_limits {}; + +// WARNING: the following oneflow::numeric_limits definitions are there only to support +// HIP compilation for the moment. Use std::numeric_limits if you are not +// compiling for ROCm. +// from @colesbury: "The functions on numeric_limits aren't marked with +// __device__ which is why they don't work with ROCm. CUDA allows them +// because they're constexpr." + +namespace { +// ROCm doesn't like INFINITY too. +constexpr double inf = INFINITY; +} // namespace + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC bool lowest() { return false; } + OF_NUMERICS_FUNC bool max() { return true; } + OF_NUMERICS_FUNC bool lower_bound() { return false; } + OF_NUMERICS_FUNC bool upper_bound() { return true; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC uint8_t lowest() { return 0; } + OF_NUMERICS_FUNC uint8_t max() { return UINT8_MAX; } + OF_NUMERICS_FUNC uint8_t lower_bound() { return 0; } + OF_NUMERICS_FUNC uint8_t upper_bound() { return UINT8_MAX; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC int8_t lowest() { return INT8_MIN; } + OF_NUMERICS_FUNC int8_t max() { return INT8_MAX; } + OF_NUMERICS_FUNC int8_t lower_bound() { return INT8_MIN; } + OF_NUMERICS_FUNC int8_t upper_bound() { return INT8_MAX; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC int16_t lowest() { return INT16_MIN; } + OF_NUMERICS_FUNC int16_t max() { return INT16_MAX; } + OF_NUMERICS_FUNC int16_t lower_bound() { return INT16_MIN; } + OF_NUMERICS_FUNC int16_t upper_bound() { return INT16_MAX; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC int32_t lowest() { return INT32_MIN; } + OF_NUMERICS_FUNC int32_t max() { return INT32_MAX; } + OF_NUMERICS_FUNC int32_t lower_bound() { return INT32_MIN; } + OF_NUMERICS_FUNC int32_t upper_bound() { return INT32_MAX; } +}; + +template<> +struct numeric_limits { +#ifdef _MSC_VER + OF_NUMERICS_FUNC int64_t lowest() { return _I64_MIN; } + OF_NUMERICS_FUNC int64_t max() { return _I64_MAX; } + OF_NUMERICS_FUNC int64_t lower_bound() { return _I64_MIN; } + OF_NUMERICS_FUNC int64_t upper_bound() { return _I64_MAX; } +#else + OF_NUMERICS_FUNC int64_t lowest() { return INT64_MIN; } + OF_NUMERICS_FUNC int64_t max() { return INT64_MAX; } + OF_NUMERICS_FUNC int64_t lower_bound() { return INT64_MIN; } + OF_NUMERICS_FUNC int64_t upper_bound() { return INT64_MAX; } +#endif +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC float lowest() { return -FLT_MAX; } + OF_NUMERICS_FUNC float max() { return FLT_MAX; } + OF_NUMERICS_FUNC float lower_bound() { return -static_cast(inf); } + OF_NUMERICS_FUNC float upper_bound() { return static_cast(inf); } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC double lowest() { return -DBL_MAX; } + OF_NUMERICS_FUNC double max() { return DBL_MAX; } + OF_NUMERICS_FUNC double lower_bound() { return -inf; } + OF_NUMERICS_FUNC double upper_bound() { return inf; } +}; + +} // namespace detail } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/util/numerics.hip.h b/oneflow/core/kernel/util/numerics.hip.h index 5b1bad8..68b1b53 100644 --- 
a/oneflow/core/kernel/util/numerics.hip.h +++ b/oneflow/core/kernel/util/numerics.hip.h @@ -1,250 +1,250 @@ - -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCNumerics.cuh -#ifndef ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H -#define ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H -#pragma once - -#include -#include -#include -#include -#include - -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/util/numeric_limits.hip.h" - -namespace oneflow { -namespace detail { - -template -struct numerics {}; - -template -OF_NUMERICS_FUNC T powi(T a, T b) { - assert(numerics::ge(b, 0)); - T result = 1; - while (b) { - if (b & 1) { result *= a; } - b /= 2; - a *= a; - } - return result; -} - -template<> -struct numerics { - OF_NUMERICS_FUNC uint8_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC uint8_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC uint8_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC uint8_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(uint8_t a, uint8_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(uint8_t a, uint8_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(uint8_t a, uint8_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(uint8_t a, uint8_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(uint8_t a, uint8_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(uint8_t a, uint8_t b) { return a != b; } - - OF_NUMERICS_FUNC uint8_t add(uint8_t a, uint8_t b) { return a + b; } - OF_NUMERICS_FUNC uint8_t mul(uint8_t a, uint8_t b) { return a * b; } - OF_NUMERICS_FUNC uint8_t sub(uint8_t a, uint8_t b) { return a - b; } - OF_NUMERICS_FUNC uint8_t div(uint8_t a, uint8_t b) { return a / b; } - OF_NUMERICS_FUNC uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(uint8_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(uint8_t a) { return false; } -}; - -#ifdef _MSC_VER -// Suppress warning C4804: '/': unsafe use of type 'bool' in operation -#pragma warning(push) -#pragma warning(disable : 4804) -#endif - -template<> -struct numerics { - OF_NUMERICS_FUNC bool min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC bool max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC bool lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC bool upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(bool a, bool b) { return a < b; } - OF_NUMERICS_FUNC bool le(bool a, bool b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(bool a, bool b) { return a > b; } - OF_NUMERICS_FUNC bool ge(bool a, bool b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(bool a, bool b) { return a == b; } - OF_NUMERICS_FUNC bool ne(bool a, bool b) { return a != b; } - OF_NUMERICS_FUNC bool add(bool a, 
bool b) { return a + b; } - OF_NUMERICS_FUNC bool mul(bool a, bool b) { return a && b; } - OF_NUMERICS_FUNC bool sub(bool a, bool b) { return a - b; } - OF_NUMERICS_FUNC bool div(bool a, bool b) { return a / b; } - OF_NUMERICS_FUNC bool isnan(bool a) { return false; } - OF_NUMERICS_FUNC bool isinf(bool a) { return false; } -}; - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -template<> -struct numerics { - OF_NUMERICS_FUNC int8_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int8_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int8_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int8_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int8_t a, int8_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int8_t a, int8_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int8_t a, int8_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int8_t a, int8_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int8_t a, int8_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(int8_t a, int8_t b) { return a != b; } - - OF_NUMERICS_FUNC int8_t add(int8_t a, int8_t b) { return a + b; } - OF_NUMERICS_FUNC int8_t mul(int8_t a, int8_t b) { return a * b; } - OF_NUMERICS_FUNC int8_t sub(int8_t a, int8_t b) { return a - b; } - OF_NUMERICS_FUNC int8_t div(int8_t a, int8_t b) { return a / b; } - OF_NUMERICS_FUNC int8_t pow(int8_t a, int8_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int8_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int8_t a) { return false; } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC int16_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int16_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int16_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int16_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int16_t a, int16_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int16_t a, int16_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int16_t a, int16_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int16_t a, int16_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int16_t a, int16_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(int16_t a, int16_t b) { return a != b; } - - OF_NUMERICS_FUNC int16_t add(int16_t a, int16_t b) { return a + b; } - OF_NUMERICS_FUNC int16_t mul(int16_t a, int16_t b) { return a * b; } - OF_NUMERICS_FUNC int16_t sub(int16_t a, int16_t b) { return a - b; } - OF_NUMERICS_FUNC int16_t div(int16_t a, int16_t b) { return a / b; } - OF_NUMERICS_FUNC int16_t pow(int16_t a, int16_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int16_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int16_t a) { return false; } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC int32_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int32_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int32_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int32_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int32_t a, int32_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int32_t a, int32_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int32_t a, int32_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int32_t a, int32_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int32_t a, int32_t b) 
{ return a == b; } - OF_NUMERICS_FUNC bool ne(int32_t a, int32_t b) { return a != b; } - - OF_NUMERICS_FUNC int32_t add(int32_t a, int32_t b) { return a + b; } - OF_NUMERICS_FUNC int32_t mul(int32_t a, int32_t b) { return a * b; } - OF_NUMERICS_FUNC int32_t sub(int32_t a, int32_t b) { return a - b; } - OF_NUMERICS_FUNC int32_t div(int32_t a, int32_t b) { return a / b; } - OF_NUMERICS_FUNC int32_t pow(int32_t a, int32_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int32_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int32_t a) { return false; } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC int64_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int64_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int64_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int64_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int64_t a, int64_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int64_t a, int64_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int64_t a, int64_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int64_t a, int64_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int64_t a, int64_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(int64_t a, int64_t b) { return a != b; } - - OF_NUMERICS_FUNC int64_t add(int64_t a, int64_t b) { return a + b; } - OF_NUMERICS_FUNC int64_t mul(int64_t a, int64_t b) { return a * b; } - OF_NUMERICS_FUNC int64_t sub(int64_t a, int64_t b) { return a - b; } - OF_NUMERICS_FUNC int64_t div(int64_t a, int64_t b) { return a / b; }; - OF_NUMERICS_FUNC int64_t pow(int64_t a, int64_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int64_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int64_t a) { return false; } -}; - -// DEPRECATED: use math functions from std and cuda math API (if needed) -template<> -struct numerics { - OF_NUMERICS_FUNC float min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC float max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC float lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC float upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(float a, float b) { return a < b; } - OF_NUMERICS_FUNC bool le(float a, float b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(float a, float b) { return a > b; } - OF_NUMERICS_FUNC bool ge(float a, float b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(float a, float b) { return a == b; } - OF_NUMERICS_FUNC bool ne(float a, float b) { return a != b; } - - OF_NUMERICS_FUNC float sqrt(float a) { return sqrtf(a); } - OF_NUMERICS_FUNC float atan(float a) { return atanf(a); } - OF_NUMERICS_FUNC float add(float a, float b) { return a + b; } - OF_NUMERICS_FUNC float div(float a, float b) { return a / b; } - OF_NUMERICS_FUNC float mul(float a, float b) { return a * b; } - OF_NUMERICS_FUNC float sub(float a, float b) { return a - b; } - OF_NUMERICS_FUNC float pow(float a, float b) { return powf(a, b); } - OF_NUMERICS_FUNC bool isnan(float a) { return ::isnan(a); } - OF_NUMERICS_FUNC bool isinf(float a) { return ::isinf(a); } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC double min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC double max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC double lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC double upper_bound() { return 
detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(double a, double b) { return a < b; } - OF_NUMERICS_FUNC bool le(double a, double b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(double a, double b) { return a > b; } - OF_NUMERICS_FUNC bool ge(double a, double b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(double a, double b) { return a == b; } - OF_NUMERICS_FUNC bool ne(double a, double b) { return a != b; } - - OF_NUMERICS_FUNC double sqrt(double a) { return ::sqrt(a); } - OF_NUMERICS_FUNC double atan(double a) { return ::atan(a); } - OF_NUMERICS_FUNC double add(double a, double b) { return a + b; } - OF_NUMERICS_FUNC double div(double a, double b) { return a / b; } - OF_NUMERICS_FUNC double mul(double a, double b) { return a * b; } - OF_NUMERICS_FUNC double sub(double a, double b) { return a - b; } - OF_NUMERICS_FUNC double pow(double a, double b) { return ::pow(a, b); } - OF_NUMERICS_FUNC bool isnan(double a) { return ::isnan(a); } - OF_NUMERICS_FUNC bool isinf(double a) { return ::isinf(a); } -}; - -} // namespace detail -} // namespace oneflow - + +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCNumerics.cuh +#ifndef ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H +#define ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H +#pragma once + +#include +#include +#include +#include +#include + +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/util/numeric_limits.hip.h" + +namespace oneflow { +namespace detail { + +template +struct numerics {}; + +template +OF_NUMERICS_FUNC T powi(T a, T b) { + assert(numerics::ge(b, 0)); + T result = 1; + while (b) { + if (b & 1) { result *= a; } + b /= 2; + a *= a; + } + return result; +} + +template<> +struct numerics { + OF_NUMERICS_FUNC uint8_t min() { return detail::numeric_limits::lowest(); } + OF_NUMERICS_FUNC uint8_t max() { return detail::numeric_limits::max(); } + OF_NUMERICS_FUNC uint8_t lower_bound() { return detail::numeric_limits::lower_bound(); } + OF_NUMERICS_FUNC uint8_t upper_bound() { return detail::numeric_limits::upper_bound(); } + + OF_NUMERICS_FUNC bool lt(uint8_t a, uint8_t b) { return a < b; } + OF_NUMERICS_FUNC bool le(uint8_t a, uint8_t b) { return a <= b; } + OF_NUMERICS_FUNC bool gt(uint8_t a, uint8_t b) { return a > b; } + OF_NUMERICS_FUNC bool ge(uint8_t a, uint8_t b) { return a >= b; } + OF_NUMERICS_FUNC bool eq(uint8_t a, uint8_t b) { return a == b; } + OF_NUMERICS_FUNC bool ne(uint8_t a, uint8_t b) { return a != b; } + + OF_NUMERICS_FUNC uint8_t add(uint8_t a, uint8_t b) { return a + b; } + OF_NUMERICS_FUNC uint8_t mul(uint8_t a, uint8_t b) { return a * b; } + OF_NUMERICS_FUNC uint8_t sub(uint8_t a, uint8_t b) { return a - b; } + OF_NUMERICS_FUNC uint8_t div(uint8_t a, uint8_t b) { return a / b; } + OF_NUMERICS_FUNC uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } + OF_NUMERICS_FUNC bool isnan(uint8_t a) { return false; 
}
+  OF_NUMERICS_FUNC bool isinf(uint8_t a) { return false; }
+};
+
+#ifdef _MSC_VER
+// Suppress warning C4804: '/': unsafe use of type 'bool' in operation
+#pragma warning(push)
+#pragma warning(disable : 4804)
+#endif
+
+template<>
+struct numerics<bool> {
+  OF_NUMERICS_FUNC bool min() { return detail::numeric_limits<bool>::lowest(); }
+  OF_NUMERICS_FUNC bool max() { return detail::numeric_limits<bool>::max(); }
+  OF_NUMERICS_FUNC bool lower_bound() { return detail::numeric_limits<bool>::lower_bound(); }
+  OF_NUMERICS_FUNC bool upper_bound() { return detail::numeric_limits<bool>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(bool a, bool b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(bool a, bool b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(bool a, bool b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(bool a, bool b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(bool a, bool b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(bool a, bool b) { return a != b; }
+  OF_NUMERICS_FUNC bool add(bool a, bool b) { return a + b; }
+  OF_NUMERICS_FUNC bool mul(bool a, bool b) { return a && b; }
+  OF_NUMERICS_FUNC bool sub(bool a, bool b) { return a - b; }
+  OF_NUMERICS_FUNC bool div(bool a, bool b) { return a / b; }
+  OF_NUMERICS_FUNC bool isnan(bool a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(bool a) { return false; }
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+template<>
+struct numerics<int8_t> {
+  OF_NUMERICS_FUNC int8_t min() { return detail::numeric_limits<int8_t>::lowest(); }
+  OF_NUMERICS_FUNC int8_t max() { return detail::numeric_limits<int8_t>::max(); }
+  OF_NUMERICS_FUNC int8_t lower_bound() { return detail::numeric_limits<int8_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int8_t upper_bound() { return detail::numeric_limits<int8_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int8_t a, int8_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int8_t a, int8_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int8_t a, int8_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int8_t a, int8_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int8_t a, int8_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int8_t a, int8_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int8_t add(int8_t a, int8_t b) { return a + b; }
+  OF_NUMERICS_FUNC int8_t mul(int8_t a, int8_t b) { return a * b; }
+  OF_NUMERICS_FUNC int8_t sub(int8_t a, int8_t b) { return a - b; }
+  OF_NUMERICS_FUNC int8_t div(int8_t a, int8_t b) { return a / b; }
+  OF_NUMERICS_FUNC int8_t pow(int8_t a, int8_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int8_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int8_t a) { return false; }
+};
+
+template<>
+struct numerics<int16_t> {
+  OF_NUMERICS_FUNC int16_t min() { return detail::numeric_limits<int16_t>::lowest(); }
+  OF_NUMERICS_FUNC int16_t max() { return detail::numeric_limits<int16_t>::max(); }
+  OF_NUMERICS_FUNC int16_t lower_bound() { return detail::numeric_limits<int16_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int16_t upper_bound() { return detail::numeric_limits<int16_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int16_t a, int16_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int16_t a, int16_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int16_t a, int16_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int16_t a, int16_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int16_t a, int16_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int16_t a, int16_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int16_t add(int16_t a, int16_t b) { return a + b; }
+  OF_NUMERICS_FUNC int16_t mul(int16_t a, int16_t b) { return a * b; }
+  OF_NUMERICS_FUNC int16_t sub(int16_t a, int16_t b) { return a - b; }
+  OF_NUMERICS_FUNC int16_t div(int16_t a, int16_t b) { return a / b; }
+  OF_NUMERICS_FUNC int16_t pow(int16_t a, int16_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int16_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int16_t a) { return false; }
+};
+
+template<>
+struct numerics<int32_t> {
+  OF_NUMERICS_FUNC int32_t min() { return detail::numeric_limits<int32_t>::lowest(); }
+  OF_NUMERICS_FUNC int32_t max() { return detail::numeric_limits<int32_t>::max(); }
+  OF_NUMERICS_FUNC int32_t lower_bound() { return detail::numeric_limits<int32_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int32_t upper_bound() { return detail::numeric_limits<int32_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int32_t a, int32_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int32_t a, int32_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int32_t a, int32_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int32_t a, int32_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int32_t a, int32_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int32_t a, int32_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int32_t add(int32_t a, int32_t b) { return a + b; }
+  OF_NUMERICS_FUNC int32_t mul(int32_t a, int32_t b) { return a * b; }
+  OF_NUMERICS_FUNC int32_t sub(int32_t a, int32_t b) { return a - b; }
+  OF_NUMERICS_FUNC int32_t div(int32_t a, int32_t b) { return a / b; }
+  OF_NUMERICS_FUNC int32_t pow(int32_t a, int32_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int32_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int32_t a) { return false; }
+};
+
+template<>
+struct numerics<int64_t> {
+  OF_NUMERICS_FUNC int64_t min() { return detail::numeric_limits<int64_t>::lowest(); }
+  OF_NUMERICS_FUNC int64_t max() { return detail::numeric_limits<int64_t>::max(); }
+  OF_NUMERICS_FUNC int64_t lower_bound() { return detail::numeric_limits<int64_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int64_t upper_bound() { return detail::numeric_limits<int64_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int64_t a, int64_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int64_t a, int64_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int64_t a, int64_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int64_t a, int64_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int64_t a, int64_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int64_t a, int64_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int64_t add(int64_t a, int64_t b) { return a + b; }
+  OF_NUMERICS_FUNC int64_t mul(int64_t a, int64_t b) { return a * b; }
+  OF_NUMERICS_FUNC int64_t sub(int64_t a, int64_t b) { return a - b; }
+  OF_NUMERICS_FUNC int64_t div(int64_t a, int64_t b) { return a / b; }
+  OF_NUMERICS_FUNC int64_t pow(int64_t a, int64_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int64_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int64_t a) { return false; }
+};
+
+// DEPRECATED: use math functions from std and cuda math API (if needed)
+template<>
+struct numerics<float> {
+  OF_NUMERICS_FUNC float min() { return detail::numeric_limits<float>::lowest(); }
+  OF_NUMERICS_FUNC float max() { return detail::numeric_limits<float>::max(); }
+  OF_NUMERICS_FUNC float lower_bound() { return detail::numeric_limits<float>::lower_bound(); }
+  OF_NUMERICS_FUNC float upper_bound() { return detail::numeric_limits<float>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(float a, float b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(float a, float b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(float a, float b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(float a, float b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(float a, float b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(float a, float b) { return a != b; }
+
+  OF_NUMERICS_FUNC float sqrt(float a) { return sqrtf(a); }
+  OF_NUMERICS_FUNC float atan(float a) { return atanf(a); }
+  OF_NUMERICS_FUNC float add(float a, float b) { return a + b; }
+  OF_NUMERICS_FUNC float div(float a, float b) { return a / b; }
+  OF_NUMERICS_FUNC float mul(float a, float b) { return a * b; }
+  OF_NUMERICS_FUNC float sub(float a, float b) { return a - b; }
+  OF_NUMERICS_FUNC float pow(float a, float b) { return powf(a, b); }
+  OF_NUMERICS_FUNC bool isnan(float a) { return ::isnan(a); }
+  OF_NUMERICS_FUNC bool isinf(float a) { return ::isinf(a); }
+};
+
+template<>
+struct numerics<double> {
+  OF_NUMERICS_FUNC double min() { return detail::numeric_limits<double>::lowest(); }
+  OF_NUMERICS_FUNC double max() { return detail::numeric_limits<double>::max(); }
+  OF_NUMERICS_FUNC double lower_bound() { return detail::numeric_limits<double>::lower_bound(); }
+  OF_NUMERICS_FUNC double upper_bound() { return detail::numeric_limits<double>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(double a, double b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(double a, double b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(double a, double b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(double a, double b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(double a, double b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(double a, double b) { return a != b; }
+
+  OF_NUMERICS_FUNC double sqrt(double a) { return ::sqrt(a); }
+  OF_NUMERICS_FUNC double atan(double a) { return ::atan(a); }
+  OF_NUMERICS_FUNC double add(double a, double b) { return a + b; }
+  OF_NUMERICS_FUNC double div(double a, double b) { return a / b; }
+  OF_NUMERICS_FUNC double mul(double a, double b) { return a * b; }
+  OF_NUMERICS_FUNC double sub(double a, double b) { return a - b; }
+  OF_NUMERICS_FUNC double pow(double a, double b) { return ::pow(a, b); }
+  OF_NUMERICS_FUNC bool isnan(double a) { return ::isnan(a); }
+  OF_NUMERICS_FUNC bool isinf(double a) { return ::isinf(a); }
+};
+
+} // namespace detail
+} // namespace oneflow
+
 #endif  // ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H
\ No newline at end of file
diff --git a/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp
index 9ae4c6f..d24d580 100644
--- a/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp
+++ b/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp
@@ -1,68 +1,68 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
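// Illustrative sketch (standalone, not OneFlow code): the powi helper defined earlier in
// numerics.hip.h is integer exponentiation by squaring -- it halves the exponent each step,
// so pow(a, b) for integral types costs O(log b) multiplies instead of b. The host-only copy
// below mirrors that loop; PowiRef is an assumed name used only for this check.
#include <cassert>
#include <cstdint>

static int64_t PowiRef(int64_t a, int64_t b) {  // assumes b >= 0, mirroring the assert in powi
  int64_t result = 1;
  while (b) {
    if (b & 1) { result *= a; }
    b /= 2;
    a *= a;
  }
  return result;
}

int main() {
  assert(PowiRef(3, 0) == 1);
  assert(PowiRef(3, 5) == 243);    // 3^5 via repeated squaring
  assert(PowiRef(2, 10) == 1024);  // 2^10
  return 0;
}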
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_binary_core.h" -#include "oneflow/core/ndarray/binary_func.h" - -namespace oneflow { - -namespace { - -template class binary_func> -__global__ void NdarrayApplyBinaryApplyGpu(size_t n, - typename BinaryFuncTrait::return_type* y, - const T* a, const T* b) { - NdarrayApplyBinaryCore::Apply(n, y, a, b); -} - -template class binary_func> -__global__ void NdarrayApplyBinaryInplaceApplyGpu(size_t n, T* y, const T* x) { - NdarrayApplyBinaryCore::InplaceApply(n, y, x); -} - -} // namespace - -template class binary_func> -struct NdarrayApplyBinaryCoreWrapper final { - static void Apply(ep::Stream* stream, - const XpuVarNdarray::return_type>& y, - const XpuVarNdarray& a, const XpuVarNdarray& b) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayApplyBinaryApplyGpu), stream, n, n, y.host_ptr(), - a.host_ptr(), b.host_ptr()); - } - static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayApplyBinaryInplaceApplyGpu), stream, n, n, y.host_ptr(), - x.host_ptr()); - } -}; - -#define INSTANTIATE_NDARRAY_APPLY_BINARY_CORE(dtype_pair, binary_func) \ - template struct NdarrayApplyBinaryCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - ARITHMETIC_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - LOGICAL_BINARY_FUNC_SEQ); +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_binary_core.h" +#include "oneflow/core/ndarray/binary_func.h" + +namespace oneflow { + +namespace { + +template class binary_func> +__global__ void NdarrayApplyBinaryApplyGpu(size_t n, + typename BinaryFuncTrait::return_type* y, + const T* a, const T* b) { + NdarrayApplyBinaryCore::Apply(n, y, a, b); +} + +template class binary_func> +__global__ void NdarrayApplyBinaryInplaceApplyGpu(size_t n, T* y, const T* x) { + NdarrayApplyBinaryCore::InplaceApply(n, y, x); +} + +} // namespace + +template class binary_func> +struct NdarrayApplyBinaryCoreWrapper final { + static void Apply(ep::Stream* stream, + const XpuVarNdarray::return_type>& y, + const XpuVarNdarray& a, const XpuVarNdarray& b) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayApplyBinaryApplyGpu), stream, n, n, y.host_ptr(), + a.host_ptr(), b.host_ptr()); + } + static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayApplyBinaryInplaceApplyGpu), stream, n, n, y.host_ptr(), + x.host_ptr()); + } +}; + +#define INSTANTIATE_NDARRAY_APPLY_BINARY_CORE(dtype_pair, binary_func) \ + template struct NdarrayApplyBinaryCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + ARITHMETIC_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + LOGICAL_BINARY_FUNC_SEQ); } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp index 68e9d7b..0f7ecd4 100644 --- a/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp @@ -1,190 +1,190 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
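// Illustrative sketch (standalone, not OneFlow code): NdarrayApplyBinaryCoreWrapper above
// launches an elementwise binary kernel over n elements through OneFlow's RUN_CUDA_KERNEL
// macro. The minimal HIP program below only mirrors that launch pattern with a plain
// grid-stride loop; kElemwiseAdd, kBlockSize and the grid-size computation are assumptions
// made for this example, not definitions from the patch.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

constexpr int kBlockSize = 256;

__global__ void kElemwiseAdd(size_t n, float* y, const float* a, const float* b) {
  // Grid-stride loop: each thread handles indices i, i + gridDim*blockDim, ...
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    y[i] = a[i] + b[i];
  }
}

int main() {
  const size_t n = 1 << 20;
  std::vector<float> ha(n, 1.f), hb(n, 2.f), hy(n, 0.f);
  float *da, *db, *dy;
  hipMalloc(&da, n * sizeof(float));
  hipMalloc(&db, n * sizeof(float));
  hipMalloc(&dy, n * sizeof(float));
  hipMemcpy(da, ha.data(), n * sizeof(float), hipMemcpyHostToDevice);
  hipMemcpy(db, hb.data(), n * sizeof(float), hipMemcpyHostToDevice);
  const int grid = static_cast<int>((n + kBlockSize - 1) / kBlockSize);
  hipLaunchKernelGGL(kElemwiseAdd, dim3(grid), dim3(kBlockSize), 0, 0, n, dy, da, db);
  hipMemcpy(hy.data(), dy, n * sizeof(float), hipMemcpyDeviceToHost);
  printf("y[0] = %f\n", hy[0]);  // expected 3.0
  hipFree(da);
  hipFree(db);
  hipFree(dy);
  return 0;
}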
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.h" - -namespace oneflow { - -namespace { - -template -struct XY2XFunctor final { - __host__ __device__ XY2XFunctor(Index dim_y) : dim_y_(dim_y) {} - - __host__ __device__ Index operator()(Index idx) const { return idx / dim_y_; } - - Index dim_y_; -}; - -template -struct XY2YFunctor final { - __host__ __device__ XY2YFunctor(Index dim_y) : dim_y_(dim_y) {} - - __host__ __device__ Index operator()(Index idx) const { return idx % dim_y_; } - - Index dim_y_; -}; - -template -struct XYZ2XZFunctor final { - __host__ __device__ XYZ2XZFunctor(Index dim_y, Index dim_z) - : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} - - __host__ __device__ Index operator()(Index idx) const { - const Index x = idx / dim_yz_; - const Index z = (idx % dim_yz_) % dim_z_; - return x * dim_z_ + z; - } - - Index dim_yz_; - Index dim_z_; -}; - -template -struct XYZ2YFunctor final { - __host__ __device__ XYZ2YFunctor(Index dim_y, Index dim_z) - : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} - - __host__ __device__ Index operator()(Index idx) const { return (idx % dim_yz_) / dim_z_; } - - Index dim_yz_; - Index dim_z_; -}; - -template class binary_func, typename OffsetFunctor> -__global__ void PartialBroadcastGpu(K n, typename BinaryFuncTrait::return_type* y, - const T* a, const T* b, OffsetFunctor offset_functor) { - CUDA_1D_KERNEL_LOOP_T(K, i, n) { y[i] = binary_func::Invoke(a[i], b[offset_functor(i)]); } -} - -template class binary_func> -__global__ void GpuBroadcastBinaryFunc( - const XpuVarNdarray::return_type> y, - const XpuVarNdarray a, const XpuVarNdarray b) { - NdarrayApplyBroadcastBinaryCore::Apply(y, a, b); -} -template class binary_func> -__global__ void GpuInplaceBroadcastBinaryFunc(const XpuVarNdarray y, - const XpuVarNdarray x) { - NdarrayApplyBroadcastBinaryCore::InplaceApply(y, x); -} - -} // namespace - -template class binary_func> -struct NdarrayApplyBroadcastBinaryCoreWrapper final { - static void Apply(ep::Stream* stream, - const XpuVarNdarray::return_type>& y, - const XpuVarNdarray& a, const XpuVarNdarray& b) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - if (IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } - if (!IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } - RUN_CUDA_KERNEL((GpuBroadcastBinaryFunc), stream, n, y, a, b); - } - - template - static bool PartialBroadcast( - ep::Stream* stream, - const XpuVarNdarray::return_type>& y, - const XpuVarNdarray& a, const XpuVarNdarray& b) { - size_t n = y.host_shape().HostElemNum(); - if (y.host_shape() == a.host_shape()) { - if (y.host_shape().NumAxes() == 2) { - const K y_dim0 = y.host_shape().At(0); - const K y_dim1 = y.host_shape().At(1); - const K b_dim0 = b.host_shape().At(0); - const K b_dim1 = b.host_shape().At(1); - if (b_dim0 == y_dim0 && b_dim1 == 1) { - XY2XFunctor xy2x(y_dim1); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2x); - return true; - } - if (b_dim0 == 1 && b_dim1 == y_dim1) { - XY2YFunctor xy2y(y_dim1); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2y); - return true; - } - } - if (y.host_shape().NumAxes() == 3) { - const K y_dim0 = y.host_shape().At(0); - const K y_dim1 = y.host_shape().At(1); - const K y_dim2 = y.host_shape().At(2); - const K b_dim0 = b.host_shape().At(0); - const K b_dim1 = b.host_shape().At(1); - const K b_dim2 = b.host_shape().At(2); - if 
(b_dim0 == y_dim0 && b_dim1 == 1 && b_dim2 == y_dim2) { - XYZ2XZFunctor xyz2xz(y_dim1, y_dim2); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2xz); - return true; - } - if (b_dim0 == 1 && b_dim1 == y_dim1 && b_dim2 == 1) { - XYZ2YFunctor xyz2y(y_dim1, y_dim2); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2y); - return true; - } - } - } - return false; - } -}; - -template class binary_func> -struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper - final { - static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - XpuVarNdarray a(y.host_shape(), y.host_ptr()); - using NBB = NdarrayApplyBroadcastBinaryCoreWrapper; - if (n == 0) { return; } - if (IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { - return; - } - if (!IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { - return; - } - RUN_CUDA_KERNEL((GpuInplaceBroadcastBinaryFunc), stream, n, y, x); - } -}; - -#define INSTANTIATE_BROADCAST_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ - template struct NdarrayApplyBroadcastBinaryCoreWrapper< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); - -#define INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ - template struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, - ((bool, DataType::kBool)), DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.h" + +namespace oneflow { + +namespace { + +template +struct XY2XFunctor final { + __host__ __device__ XY2XFunctor(Index dim_y) : dim_y_(dim_y) {} + + __host__ __device__ Index operator()(Index idx) const { return idx / dim_y_; } + + Index dim_y_; +}; + +template +struct XY2YFunctor final { + __host__ __device__ XY2YFunctor(Index dim_y) : dim_y_(dim_y) {} + + __host__ __device__ Index operator()(Index idx) const { return idx % dim_y_; } + + Index dim_y_; +}; + +template +struct XYZ2XZFunctor final { + __host__ __device__ XYZ2XZFunctor(Index dim_y, Index dim_z) + : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} + + __host__ __device__ Index operator()(Index idx) const { + const Index x = idx / dim_yz_; + const Index z = (idx % dim_yz_) % dim_z_; + return x * dim_z_ + z; + } + + Index dim_yz_; + Index dim_z_; +}; + +template +struct XYZ2YFunctor final { + __host__ __device__ XYZ2YFunctor(Index dim_y, Index dim_z) + : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} + + __host__ __device__ Index operator()(Index idx) const { return (idx % dim_yz_) / dim_z_; } + + Index dim_yz_; + Index dim_z_; +}; + +template class binary_func, typename OffsetFunctor> +__global__ void PartialBroadcastGpu(K n, typename BinaryFuncTrait::return_type* y, + const T* a, const T* b, OffsetFunctor offset_functor) { + CUDA_1D_KERNEL_LOOP_T(K, i, n) { y[i] = binary_func::Invoke(a[i], b[offset_functor(i)]); } +} + +template class binary_func> +__global__ void GpuBroadcastBinaryFunc( + const XpuVarNdarray::return_type> y, + const XpuVarNdarray a, const XpuVarNdarray b) { + NdarrayApplyBroadcastBinaryCore::Apply(y, a, b); +} +template class binary_func> +__global__ void GpuInplaceBroadcastBinaryFunc(const XpuVarNdarray y, + const XpuVarNdarray x) { + NdarrayApplyBroadcastBinaryCore::InplaceApply(y, x); +} + +} // namespace + +template class binary_func> +struct NdarrayApplyBroadcastBinaryCoreWrapper final { + static void Apply(ep::Stream* stream, + const XpuVarNdarray::return_type>& y, + const XpuVarNdarray& a, const XpuVarNdarray& b) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + if (IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } + if (!IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } + RUN_CUDA_KERNEL((GpuBroadcastBinaryFunc), stream, n, y, a, b); + } + + template + static bool PartialBroadcast( + ep::Stream* stream, + const XpuVarNdarray::return_type>& y, + const XpuVarNdarray& a, const XpuVarNdarray& b) { + size_t n = y.host_shape().HostElemNum(); + if (y.host_shape() == a.host_shape()) { + if (y.host_shape().NumAxes() == 2) { + const K y_dim0 = y.host_shape().At(0); + const K y_dim1 = y.host_shape().At(1); + const K b_dim0 = b.host_shape().At(0); + const K b_dim1 = b.host_shape().At(1); + if (b_dim0 == y_dim0 && b_dim1 == 1) { + XY2XFunctor xy2x(y_dim1); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2x); + return true; + } + if (b_dim0 == 1 && b_dim1 == y_dim1) { + XY2YFunctor xy2y(y_dim1); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2y); + return true; + } + } + if (y.host_shape().NumAxes() == 3) { + const K y_dim0 = y.host_shape().At(0); + const K y_dim1 = y.host_shape().At(1); + const K y_dim2 = y.host_shape().At(2); + const K b_dim0 = b.host_shape().At(0); + const K b_dim1 = b.host_shape().At(1); + const K b_dim2 = b.host_shape().At(2); + if 
(b_dim0 == y_dim0 && b_dim1 == 1 && b_dim2 == y_dim2) { + XYZ2XZFunctor xyz2xz(y_dim1, y_dim2); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2xz); + return true; + } + if (b_dim0 == 1 && b_dim1 == y_dim1 && b_dim2 == 1) { + XYZ2YFunctor xyz2y(y_dim1, y_dim2); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2y); + return true; + } + } + } + return false; + } +}; + +template class binary_func> +struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper + final { + static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + XpuVarNdarray a(y.host_shape(), y.host_ptr()); + using NBB = NdarrayApplyBroadcastBinaryCoreWrapper; + if (n == 0) { return; } + if (IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { + return; + } + if (!IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { + return; + } + RUN_CUDA_KERNEL((GpuInplaceBroadcastBinaryFunc), stream, n, y, x); + } +}; + +#define INSTANTIATE_BROADCAST_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ + template struct NdarrayApplyBroadcastBinaryCoreWrapper< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); + +#define INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ + template struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, + ((bool, DataType::kBool)), DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp index 1b77803..d1de3fe 100644 --- a/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp @@ -1,46 +1,46 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
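// Illustrative sketch (standalone, not OneFlow code): the XY2X/XY2Y (and XYZ2XZ/XYZ2Y)
// functors above turn a flattened output index into the matching index of the broadcast
// operand, so PartialBroadcastGpu can read b[offset_functor(i)] without materializing the
// broadcast tensor. The host-only check below re-derives the 2D mappings; XY2X/XY2Y here
// are free functions written for this check, not the functor types from the patch.
#include <cassert>
#include <cstdint>

// b of shape (dim_x, 1) broadcast over y of shape (dim_x, dim_y): keep only the row index.
static int64_t XY2X(int64_t idx, int64_t dim_y) { return idx / dim_y; }
// b of shape (1, dim_y) broadcast over y of shape (dim_x, dim_y): keep only the column index.
static int64_t XY2Y(int64_t idx, int64_t dim_y) { return idx % dim_y; }

int main() {
  const int64_t dim_x = 3, dim_y = 5;
  for (int64_t x = 0; x < dim_x; ++x) {
    for (int64_t y = 0; y < dim_y; ++y) {
      const int64_t flat = x * dim_y + y;  // row-major flattened index into the output
      assert(XY2X(flat, dim_y) == x);      // (dim_x, 1) operand is indexed by its row
      assert(XY2Y(flat, dim_y) == y);      // (1, dim_y) operand is indexed by its column
    }
  }
  return 0;
}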
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.h" - -namespace oneflow { - -namespace { - -template class unary_func> -__global__ void GpuBroadcastUnaryFunc(const XpuVarNdarray y, const XpuVarNdarray x) { - NdarrayApplyBroadcastUnaryCore::Apply(y, x); -} - -} // namespace - -template class unary_func> -struct NdarrayApplyBroadcastUnaryCoreWrapper final { - static void Apply(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((GpuBroadcastUnaryFunc), stream, n, y, x); - } -}; - -#define INSTANTIATE_BROADCAST_UNARY_FUNC(dtype_pair, NDIMS, unary_func) \ - template struct NdarrayApplyBroadcastUnaryCoreWrapper< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, unary_func>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_UNARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_UNARY_FUNC_SEQ) +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.h" + +namespace oneflow { + +namespace { + +template class unary_func> +__global__ void GpuBroadcastUnaryFunc(const XpuVarNdarray y, const XpuVarNdarray x) { + NdarrayApplyBroadcastUnaryCore::Apply(y, x); +} + +} // namespace + +template class unary_func> +struct NdarrayApplyBroadcastUnaryCoreWrapper final { + static void Apply(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((GpuBroadcastUnaryFunc), stream, n, y, x); + } +}; + +#define INSTANTIATE_BROADCAST_UNARY_FUNC(dtype_pair, NDIMS, unary_func) \ + template struct NdarrayApplyBroadcastUnaryCoreWrapper< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, unary_func>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_UNARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_UNARY_FUNC_SEQ) } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp index 41f68f4..ce2b03f 100644 --- a/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp @@ -1,47 +1,47 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_unary_core.h" -#include "oneflow/core/ndarray/unary_func.h" - -namespace oneflow { - -namespace { - -template class unary_func> -__global__ void NdarrayApplyUnaryInplaceApplyGpu(T* ptr, size_t n) { - NdarrayApplyUnaryCore::InplaceApply(ptr, n); -} - -} // namespace - -template class unary_func> -struct NdarrayApplyUnaryCoreWrapper final { - static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayApplyUnaryInplaceApplyGpu), stream, n, y.host_ptr(), n); - } -}; - -#define INSTANTIATE_NDARRAY_APPLY_UNARY_CORE(dtype_pair, unary_func) \ - template struct NdarrayApplyUnaryCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_UNARY_CORE, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - ARITHMETIC_UNARY_FUNC_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_unary_core.h" +#include "oneflow/core/ndarray/unary_func.h" + +namespace oneflow { + +namespace { + +template class unary_func> +__global__ void NdarrayApplyUnaryInplaceApplyGpu(T* ptr, size_t n) { + NdarrayApplyUnaryCore::InplaceApply(ptr, n); +} + +} // namespace + +template class unary_func> +struct NdarrayApplyUnaryCoreWrapper final { + static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayApplyUnaryInplaceApplyGpu), stream, n, y.host_ptr(), n); + } +}; + +#define INSTANTIATE_NDARRAY_APPLY_UNARY_CORE(dtype_pair, unary_func) \ + template struct NdarrayApplyUnaryCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_UNARY_CORE, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, + ARITHMETIC_UNARY_FUNC_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_assign_core.hip.cpp b/oneflow/core/ndarray/ndarray_assign_core.hip.cpp index 16dbfed..f28bbea 100644 --- a/oneflow/core/ndarray/ndarray_assign_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_assign_core.hip.cpp @@ -1,63 +1,63 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_assign_core.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/kernel/kernel_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, - const XpuReducedNdarray reduced) { - NdarrayAssignCore::Assign(y, reduced); -} - -template -__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { - NdarrayAssignCore::Assign(y, x); -} - -} // namespace - -template -struct NdarrayAssignCoreWrapper final { - static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, - const XpuReducedNdarray& reduced) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), ctx, n, y, reduced); - } - static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); - } -}; - -#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ - template struct NdarrayAssignCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - INSTANTIATE_NDARRAY_ASSIGN, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, - DIM_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_assign_core.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/kernel/kernel_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, + const XpuReducedNdarray reduced) { + NdarrayAssignCore::Assign(y, reduced); +} + +template +__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { + NdarrayAssignCore::Assign(y, x); +} + +} // namespace + +template +struct NdarrayAssignCoreWrapper final { + static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, + const XpuReducedNdarray& reduced) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), ctx, n, y, reduced); + } + static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); + } +}; + +#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ + template struct NdarrayAssignCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + INSTANTIATE_NDARRAY_ASSIGN, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, + DIM_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp b/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp index b651aa5..abd995b 100644 --- a/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp +++ b/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp @@ -1,383 +1,383 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/ndarray/ndarray_reduce_impl.h" -#include "oneflow/core/ndarray/binary_func.h" -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/common/shape.h" -#include "oneflow/core/common/permutation_iterator.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace hipcub { -struct Prod { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { - return a * b; - } -}; -struct Any { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { - return a || b; - } -}; -struct All { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { - return a && b; - } -}; -} // namespace hipcub - -namespace oneflow { - -namespace { - -template class R, typename T, typename K, typename RetT> -__global__ void MatrixColReduceBy1ThreadPerColumn(K num_elems, K num_cols, const T* in, RetT* out) { - CUDA_1D_KERNEL_LOOP_T(K, j, num_cols) { - K index = j; - T sum = in[index]; - for (index += num_cols; index < num_elems; index += num_cols) { - sum = R::Invoke(sum, in[index]); - } - out[j] = sum; - } -} - -template -struct WithAlign2 { - union { - T value; - int32_t padding; - }; -}; - -template class R, typename T, typename K, typename RetT> -__global__ void MatrixColReduceByWarpBlock(K num_elems, K num_cols, const T* in, RetT* out) { - const K thread_col = threadIdx.x % kCudaWarpSize; - const K thread_row = threadIdx.x / kCudaWarpSize; - const K thread_dim_row = blockDim.x / kCudaWarpSize; - const K num_valid_threads = thread_dim_row * num_cols; // ASSERT: always <= num_elems - const K col = blockIdx.x * kCudaWarpSize + thread_col; - __shared__ WithAlign2 partial_values[kCudaWarpSize * kCudaWarpSize]; - if (col < num_cols) { - K index = thread_row * num_cols + col; - T val = in[index]; - for (index += num_valid_threads; index < num_elems; index += num_valid_threads) { - val = R::Invoke(val, in[index]); - } - partial_values[threadIdx.x].value = val; - } - __syncthreads(); - if (col < num_cols && thread_row == 0) { - int index = thread_col; - T val = partial_values[index].value; - for (index += kCudaWarpSize; index < blockDim.x; index += kCudaWarpSize) { - val = R::Invoke(val, partial_values[index].value); - } - out[col] = val; - } -} - -template class R, typename T, typename K, typename RetT> -void MatrixColReduceBy1BlockLayer(ep::Stream* stream, K num_elems, K num_cols, const T* in, - RetT* out) { - CHECK_LE(num_cols, kCudaMaxBlocksNum * kCudaWarpSize); - const K num_rows = num_elems / num_cols; - CHECK_GT(num_rows, 0); - if (num_rows < kCudaWarpSize) { - RUN_CUDA_KERNEL((MatrixColReduceBy1ThreadPerColumn), stream, num_cols, num_elems, - num_cols, in, out); - } else { - const int num_blocks = (num_cols + kCudaWarpSize - 1) / kCudaWarpSize; - const int num_threads = kCudaWarpSize * kCudaWarpSize; - auto Reduce = &MatrixColReduceByWarpBlock; - Reduce<<As()->cuda_stream()>>>( - num_elems, num_cols, in, out); - } -} - -const static int32_t kNumRows4OneBlockLayer = kCudaWarpSize * kCudaWarpSize; -const static int32_t kNumCols4OneBlockLayer = kCudaMaxBlocksNum * kCudaWarpSize / 2; - -template class R, typename T, typename K> -void MatrixColReduceK(ep::Stream* stream, K num_rows, K num_cols, const T* in, - typename BinaryFuncTrait::return_type* out, T* tmp) { - K num_elems = num_rows * num_cols; - if (num_rows < kNumRows4OneBlockLayer || num_cols > kNumCols4OneBlockLayer) { - MatrixColReduceBy1BlockLayer::return_type>( - stream, 
num_elems, num_cols, in, out); - } else { - int scale_shift = 1; - for (; true; ++scale_shift) { - if ((num_rows >> scale_shift) < kNumRows4OneBlockLayer) { break; } - if ((num_cols << scale_shift) > kNumCols4OneBlockLayer) { break; } - } - MatrixColReduceBy1BlockLayer(stream, num_elems, (num_cols << scale_shift), in, tmp); - // recursively calls MatrixColReduceK(...) log32(num_rows) times at most - MatrixColReduceK(stream, (1 << scale_shift), num_cols, tmp, out, tmp); - } -} - -template class R, typename T> -void MatrixColReduce(ep::Stream* stream, int64_t num_rows, int64_t num_cols, const T* in, - typename BinaryFuncTrait::return_type* out, T* tmp) { - if (IsKernelSafeInt32(num_rows * num_cols)) { - return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); - } else { - return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); - } -} - -} // namespace - -template class binary_func> -struct CubFunctor4BianryFunc; - -#define SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC(func_name) \ - template \ - struct CubFunctor4BianryFunc final { \ - using type = hipcub::func_name; \ - }; -OF_PP_FOR_EACH_ATOMIC(SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC, REDUCE_BINARY_FUNC_NAME_SEQ); -#undef SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC - -struct RowOffsetFunctor final { - OF_DEVICE_FUNC explicit RowOffsetFunctor(int32_t num_cols) : num_cols_(num_cols) {} - OF_DEVICE_FUNC int32_t operator()(const int32_t& x) const { return x * num_cols_; } - int32_t num_cols_; -}; - -template class binary_func> -struct NdarrayScalarReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - return y.shape().ElemNum() == 1; - } - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - size_t x_size = x.shape().ElemNum(); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), x_size, - typename CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - // CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -template class binary_func> -struct NdarrayMatrixRowReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - if (y.shape().ElemNum() > GetMaxVal()) { return false; } - if (x.shape().NumAxes() != 2) { return false; } - if (y.shape().NumAxes() != 2) { return false; } - return x.shape().At(0) == y.shape().At(0) && y.shape().At(1) == 1; - } - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - int32_t num_rows = y.shape().ElemNum(); - int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); - RowOffsetFunctor get_row_offset(num_cols); - hipcub::CountingInputIterator counting_intput_it(0); - hipcub::TransformInputIterator> - transform_input_iter(counting_intput_it, get_row_offset); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceSegmentedReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), num_rows, transform_input_iter, - transform_input_iter + 1, typename 
CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -template class binary_func> -struct NdarrayMatrixColReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - if (y.shape().ElemNum() > GetMaxVal()) { return false; } - if (x.shape().NumAxes() != 2) { return false; } - if (y.shape().NumAxes() != 2) { return false; } - return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1); - } - - struct XY2YXFunctor final { - __host__ __device__ XY2YXFunctor(int32_t dim_x, int32_t dim_y) : dim_x_(dim_x), dim_y_(dim_y) {} - - __host__ __device__ int32_t operator()(const int32_t& idx) const { - const int32_t y = idx / dim_x_; - const int32_t x = idx % dim_x_; - return x * dim_y_ + y; - } - - int32_t dim_x_; - int32_t dim_y_; - }; - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - int64_t num_rows = x.shape().At(0); - int64_t num_cols = x.shape().At(1); - if (num_cols < kNumCols4OneBlockLayer) { - return MatrixColReduce(stream, num_rows, num_cols, x.host_ptr(), y.host_ptr(), - tmp_storage.host_ptr()); - } - RowOffsetFunctor get_row_offset(num_rows); - hipcub::CountingInputIterator counting_intput_it(0); - hipcub::TransformInputIterator> - transform_input_iter(counting_intput_it, get_row_offset); - - XY2YXFunctor xy2yx(x.shape().At(0), x.shape().At(1)); - using XY2YxIndexIter = - hipcub::TransformInputIterator>; - XY2YxIndexIter xy2yx_iter(counting_intput_it, xy2yx); - PermutationIterator x_iter(x.ptr(), xy2yx_iter); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceSegmentedReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_cols, transform_input_iter, - transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -template class binary_func> -struct NdarrayXYZCubeXZReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - if (y.shape().ElemNum() > GetMaxVal()) { return false; } - if (x.shape().NumAxes() != 3) { return false; } - if (y.shape().NumAxes() != 3) { return false; } - return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1) && y.shape().At(2) == 1; - } - - struct XYZ2YxzFunctor final { - __host__ __device__ XYZ2YxzFunctor(int32_t dim_x, int32_t dim_y, int32_t dim_z) - : dim_z_(dim_z), dim_xz_(dim_x * dim_z), dim_yz_(dim_y * dim_z) {} - - __host__ __device__ int32_t operator()(const int32_t& idx) const { - const int32_t y = idx / dim_xz_; - const int32_t xz_idx = idx % dim_xz_; - const int32_t x = xz_idx / dim_z_; - const int32_t z = xz_idx % dim_z_; - return x * dim_yz_ + y * dim_z_ + z; - } - - int32_t dim_z_; - int32_t dim_xz_; - int32_t dim_yz_; - }; - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - int32_t 
num_rows = y.shape().ElemNum(); - int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); - - RowOffsetFunctor get_row_offset(num_cols); - hipcub::CountingInputIterator counting_intput_it(0); - hipcub::TransformInputIterator> - transform_input_iter(counting_intput_it, get_row_offset); - - XYZ2YxzFunctor xyz2yxz(x.shape().At(0), x.shape().At(1), x.shape().At(2)); - using XYZ2YxzIndexIter = - hipcub::TransformInputIterator>; - XYZ2YxzIndexIter xyz2yxz_iter(counting_intput_it, xyz2yxz); - PermutationIterator x_iter(x.ptr(), xyz2yxz_iter); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceSegmentedReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_rows, transform_input_iter, - transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -namespace { - -template class binary_func> -__global__ void NdarrayReduceGpuInplaceReduceAxis(const XpuReducedNdarray dst_reduced, - const XpuReducedNdarray x, int axis) { - NdarrayReduceCore::ReduceAxis(dst_reduced, x, axis); -} - -} // namespace - -template class binary_func> -struct NdarrayReduceCoreWrapper final { - static void ReduceAxis(ep::Stream* stream, const XpuReducedNdarray& dst_reduced, - const XpuReducedNdarray& x, int axis) { - size_t n = x.host_shape().HostElemNum(); - RUN_CUDA_KERNEL((NdarrayReduceGpuInplaceReduceAxis), stream, n, - dst_reduced, x, axis); - } -}; - -#define INSTANTIATE_NDARRAY_REDUCE_IMPL(dtype, binary_func) \ - template struct NdarrayScalarReduce; \ - template struct NdarrayMatrixRowReduce; \ - template struct NdarrayMatrixColReduce; \ - template struct NdarrayXYZCubeXZReduce; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ, - ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ - BOOL_DATA_TYPE_SEQ, - LOGICAL_REDUCE_BINARY_FUNC_SEQ); - -#define INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER(dtype_pair, NDIMS, binary_func) \ - template struct NdarrayReduceCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ - BOOL_DATA_TYPE_SEQ, - DIM_SEQ, LOGICAL_REDUCE_BINARY_FUNC_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
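// Illustrative sketch (standalone, not OneFlow code): MatrixColReduceBy1ThreadPerColumn above
// assigns one thread per output column; each thread walks its column with stride num_cols and
// folds the elements with R::Invoke. The host-only function below reproduces that traversal
// for a sum reduction so the indexing can be checked on the CPU; SumReduceColumns is an
// assumed name used only for this example.
#include <cassert>
#include <vector>

static void SumReduceColumns(size_t num_rows, size_t num_cols, const std::vector<float>& in,
                             std::vector<float>& out) {
  const size_t num_elems = num_rows * num_cols;
  for (size_t j = 0; j < num_cols; ++j) {  // one "thread" per column
    size_t index = j;
    float acc = in[index];                 // first element of the column
    for (index += num_cols; index < num_elems; index += num_cols) {
      acc += in[index];                    // R::Invoke(acc, in[index]) with R = sum
    }
    out[j] = acc;
  }
}

int main() {
  const size_t rows = 4, cols = 3;
  std::vector<float> in(rows * cols, 1.f);  // every element is 1, so each column sums to rows
  std::vector<float> out(cols, 0.f);
  SumReduceColumns(rows, cols, in, out);
  for (float v : out) { assert(v == static_cast<float>(rows)); }
  return 0;
}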
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/ndarray/ndarray_reduce_impl.h" +#include "oneflow/core/ndarray/binary_func.h" +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/common/shape.h" +#include "oneflow/core/common/permutation_iterator.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace hipcub { +struct Prod { + template + __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return a * b; + } +}; +struct Any { + template + __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return a || b; + } +}; +struct All { + template + __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return a && b; + } +}; +} // namespace hipcub + +namespace oneflow { + +namespace { + +template class R, typename T, typename K, typename RetT> +__global__ void MatrixColReduceBy1ThreadPerColumn(K num_elems, K num_cols, const T* in, RetT* out) { + CUDA_1D_KERNEL_LOOP_T(K, j, num_cols) { + K index = j; + T sum = in[index]; + for (index += num_cols; index < num_elems; index += num_cols) { + sum = R::Invoke(sum, in[index]); + } + out[j] = sum; + } +} + +template +struct WithAlign2 { + union { + T value; + int32_t padding; + }; +}; + +template class R, typename T, typename K, typename RetT> +__global__ void MatrixColReduceByWarpBlock(K num_elems, K num_cols, const T* in, RetT* out) { + const K thread_col = threadIdx.x % kCudaWarpSize; + const K thread_row = threadIdx.x / kCudaWarpSize; + const K thread_dim_row = blockDim.x / kCudaWarpSize; + const K num_valid_threads = thread_dim_row * num_cols; // ASSERT: always <= num_elems + const K col = blockIdx.x * kCudaWarpSize + thread_col; + __shared__ WithAlign2 partial_values[kCudaWarpSize * kCudaWarpSize]; + if (col < num_cols) { + K index = thread_row * num_cols + col; + T val = in[index]; + for (index += num_valid_threads; index < num_elems; index += num_valid_threads) { + val = R::Invoke(val, in[index]); + } + partial_values[threadIdx.x].value = val; + } + __syncthreads(); + if (col < num_cols && thread_row == 0) { + int index = thread_col; + T val = partial_values[index].value; + for (index += kCudaWarpSize; index < blockDim.x; index += kCudaWarpSize) { + val = R::Invoke(val, partial_values[index].value); + } + out[col] = val; + } +} + +template class R, typename T, typename K, typename RetT> +void MatrixColReduceBy1BlockLayer(ep::Stream* stream, K num_elems, K num_cols, const T* in, + RetT* out) { + CHECK_LE(num_cols, kCudaMaxBlocksNum * kCudaWarpSize); + const K num_rows = num_elems / num_cols; + CHECK_GT(num_rows, 0); + if (num_rows < kCudaWarpSize) { + RUN_CUDA_KERNEL((MatrixColReduceBy1ThreadPerColumn), stream, num_cols, num_elems, + num_cols, in, out); + } else { + const int num_blocks = (num_cols + kCudaWarpSize - 1) / kCudaWarpSize; + const int num_threads = kCudaWarpSize * kCudaWarpSize; + auto Reduce = &MatrixColReduceByWarpBlock; + Reduce<<As()->cuda_stream()>>>( + num_elems, num_cols, in, out); + } +} + +const static int32_t kNumRows4OneBlockLayer = kCudaWarpSize * kCudaWarpSize; +const static int32_t kNumCols4OneBlockLayer = kCudaMaxBlocksNum * kCudaWarpSize / 2; + +template class R, typename T, typename K> +void MatrixColReduceK(ep::Stream* stream, K num_rows, K num_cols, const T* in, + typename BinaryFuncTrait::return_type* out, T* tmp) { + K num_elems = num_rows * num_cols; + if (num_rows < kNumRows4OneBlockLayer || num_cols > kNumCols4OneBlockLayer) { + MatrixColReduceBy1BlockLayer::return_type>( + stream, 
num_elems, num_cols, in, out); + } else { + int scale_shift = 1; + for (; true; ++scale_shift) { + if ((num_rows >> scale_shift) < kNumRows4OneBlockLayer) { break; } + if ((num_cols << scale_shift) > kNumCols4OneBlockLayer) { break; } + } + MatrixColReduceBy1BlockLayer(stream, num_elems, (num_cols << scale_shift), in, tmp); + // recursively calls MatrixColReduceK(...) log32(num_rows) times at most + MatrixColReduceK(stream, (1 << scale_shift), num_cols, tmp, out, tmp); + } +} + +template class R, typename T> +void MatrixColReduce(ep::Stream* stream, int64_t num_rows, int64_t num_cols, const T* in, + typename BinaryFuncTrait::return_type* out, T* tmp) { + if (IsKernelSafeInt32(num_rows * num_cols)) { + return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); + } else { + return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); + } +} + +} // namespace + +template class binary_func> +struct CubFunctor4BianryFunc; + +#define SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC(func_name) \ + template \ + struct CubFunctor4BianryFunc final { \ + using type = hipcub::func_name; \ + }; +OF_PP_FOR_EACH_ATOMIC(SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC, REDUCE_BINARY_FUNC_NAME_SEQ); +#undef SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC + +struct RowOffsetFunctor final { + OF_DEVICE_FUNC explicit RowOffsetFunctor(int32_t num_cols) : num_cols_(num_cols) {} + OF_DEVICE_FUNC int32_t operator()(const int32_t& x) const { return x * num_cols_; } + int32_t num_cols_; +}; + +template class binary_func> +struct NdarrayScalarReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + return y.shape().ElemNum() == 1; + } + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + size_t x_size = x.shape().ElemNum(); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), x_size, + typename CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + // CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +template class binary_func> +struct NdarrayMatrixRowReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + if (y.shape().ElemNum() > GetMaxVal()) { return false; } + if (x.shape().NumAxes() != 2) { return false; } + if (y.shape().NumAxes() != 2) { return false; } + return x.shape().At(0) == y.shape().At(0) && y.shape().At(1) == 1; + } + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + int32_t num_rows = y.shape().ElemNum(); + int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); + RowOffsetFunctor get_row_offset(num_cols); + hipcub::CountingInputIterator counting_intput_it(0); + hipcub::TransformInputIterator> + transform_input_iter(counting_intput_it, get_row_offset); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceSegmentedReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), num_rows, transform_input_iter, + transform_input_iter + 1, typename 
CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +template class binary_func> +struct NdarrayMatrixColReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + if (y.shape().ElemNum() > GetMaxVal()) { return false; } + if (x.shape().NumAxes() != 2) { return false; } + if (y.shape().NumAxes() != 2) { return false; } + return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1); + } + + struct XY2YXFunctor final { + __host__ __device__ XY2YXFunctor(int32_t dim_x, int32_t dim_y) : dim_x_(dim_x), dim_y_(dim_y) {} + + __host__ __device__ int32_t operator()(const int32_t& idx) const { + const int32_t y = idx / dim_x_; + const int32_t x = idx % dim_x_; + return x * dim_y_ + y; + } + + int32_t dim_x_; + int32_t dim_y_; + }; + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + int64_t num_rows = x.shape().At(0); + int64_t num_cols = x.shape().At(1); + if (num_cols < kNumCols4OneBlockLayer) { + return MatrixColReduce(stream, num_rows, num_cols, x.host_ptr(), y.host_ptr(), + tmp_storage.host_ptr()); + } + RowOffsetFunctor get_row_offset(num_rows); + hipcub::CountingInputIterator counting_intput_it(0); + hipcub::TransformInputIterator> + transform_input_iter(counting_intput_it, get_row_offset); + + XY2YXFunctor xy2yx(x.shape().At(0), x.shape().At(1)); + using XY2YxIndexIter = + hipcub::TransformInputIterator>; + XY2YxIndexIter xy2yx_iter(counting_intput_it, xy2yx); + PermutationIterator x_iter(x.ptr(), xy2yx_iter); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceSegmentedReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_cols, transform_input_iter, + transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +template class binary_func> +struct NdarrayXYZCubeXZReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + if (y.shape().ElemNum() > GetMaxVal()) { return false; } + if (x.shape().NumAxes() != 3) { return false; } + if (y.shape().NumAxes() != 3) { return false; } + return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1) && y.shape().At(2) == 1; + } + + struct XYZ2YxzFunctor final { + __host__ __device__ XYZ2YxzFunctor(int32_t dim_x, int32_t dim_y, int32_t dim_z) + : dim_z_(dim_z), dim_xz_(dim_x * dim_z), dim_yz_(dim_y * dim_z) {} + + __host__ __device__ int32_t operator()(const int32_t& idx) const { + const int32_t y = idx / dim_xz_; + const int32_t xz_idx = idx % dim_xz_; + const int32_t x = xz_idx / dim_z_; + const int32_t z = xz_idx % dim_z_; + return x * dim_yz_ + y * dim_z_ + z; + } + + int32_t dim_z_; + int32_t dim_xz_; + int32_t dim_yz_; + }; + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + int32_t 
num_rows = y.shape().ElemNum(); + int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); + + RowOffsetFunctor get_row_offset(num_cols); + hipcub::CountingInputIterator counting_intput_it(0); + hipcub::TransformInputIterator> + transform_input_iter(counting_intput_it, get_row_offset); + + XYZ2YxzFunctor xyz2yxz(x.shape().At(0), x.shape().At(1), x.shape().At(2)); + using XYZ2YxzIndexIter = + hipcub::TransformInputIterator>; + XYZ2YxzIndexIter xyz2yxz_iter(counting_intput_it, xyz2yxz); + PermutationIterator x_iter(x.ptr(), xyz2yxz_iter); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceSegmentedReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_rows, transform_input_iter, + transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +namespace { + +template class binary_func> +__global__ void NdarrayReduceGpuInplaceReduceAxis(const XpuReducedNdarray dst_reduced, + const XpuReducedNdarray x, int axis) { + NdarrayReduceCore::ReduceAxis(dst_reduced, x, axis); +} + +} // namespace + +template class binary_func> +struct NdarrayReduceCoreWrapper final { + static void ReduceAxis(ep::Stream* stream, const XpuReducedNdarray& dst_reduced, + const XpuReducedNdarray& x, int axis) { + size_t n = x.host_shape().HostElemNum(); + RUN_CUDA_KERNEL((NdarrayReduceGpuInplaceReduceAxis), stream, n, + dst_reduced, x, axis); + } +}; + +#define INSTANTIATE_NDARRAY_REDUCE_IMPL(dtype, binary_func) \ + template struct NdarrayScalarReduce; \ + template struct NdarrayMatrixRowReduce; \ + template struct NdarrayMatrixColReduce; \ + template struct NdarrayXYZCubeXZReduce; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ, + ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ + BOOL_DATA_TYPE_SEQ, + LOGICAL_REDUCE_BINARY_FUNC_SEQ); + +#define INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER(dtype_pair, NDIMS, binary_func) \ + template struct NdarrayReduceCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ + BOOL_DATA_TYPE_SEQ, + DIM_SEQ, LOGICAL_REDUCE_BINARY_FUNC_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp b/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp index 693b39c..2030335 100644 --- a/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp +++ b/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp @@ -1,62 +1,62 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
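
The column reductions above (NdarrayMatrixColReduce) drive hipcub::DeviceSegmentedReduce with one segment per output column and remap each segment's indices back onto the row-major input through XY2YXFunctor. Below is a minimal host-side sketch of that index mapping only, with plain loops standing in for the segmented reduce and summation assumed as the binary functor; it is illustrative, not part of the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Same mapping as XY2YXFunctor: a linear index in the (num_cols x num_rows)
// transposed view is turned into an offset into the row-major
// (num_rows x num_cols) input, so segment j touches exactly column j.
int32_t XY2YX(int32_t idx, int32_t num_rows, int32_t num_cols) {
  const int32_t col = idx / num_rows;  // which segment (output column)
  const int32_t row = idx % num_rows;  // position inside the segment
  return row * num_cols + col;
}

int main() {
  const int32_t num_rows = 3, num_cols = 4;
  const std::vector<float> in = {0, 1, 2,  3,
                                 4, 5, 6,  7,
                                 8, 9, 10, 11};  // row-major 3x4
  std::vector<float> col_sum(num_cols, 0.f);
  // One "segment" per column, exactly how RowOffsetFunctor(num_rows) drives
  // the segmented reduce in the code above (binary_func = sum here).
  for (int32_t j = 0; j < num_cols; ++j) {
    for (int32_t idx = j * num_rows; idx < (j + 1) * num_rows; ++idx) {
      col_sum[j] += in[XY2YX(idx, num_rows, num_cols)];
    }
  }
  for (float s : col_sum) { std::printf("%g ", s); }  // prints: 12 15 18 21
  std::printf("\n");
  return 0;
}
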
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_assign_core.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/kernel/kernel_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, - const XpuReducedNdarray reduced) { - NdarrayAssignCore::Assign(y, reduced); -} - -template -__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { - NdarrayAssignCore::Assign(y, x); -} - -} // namespace - -template -struct NdarrayAssignCoreWrapper final { - static void Assign(ep::Stream* stream, XpuVarNdarray* y, - const XpuReducedNdarray& reduced) { - size_t n = y->host_shape().HostElemNum(); - RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), stream, n, *y, reduced); - } - static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); - } -}; - -#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ - template struct NdarrayAssignCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - INSTANTIATE_NDARRAY_ASSIGN, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, - DIM_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_assign_core.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/kernel/kernel_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, + const XpuReducedNdarray reduced) { + NdarrayAssignCore::Assign(y, reduced); +} + +template +__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { + NdarrayAssignCore::Assign(y, x); +} + +} // namespace + +template +struct NdarrayAssignCoreWrapper final { + static void Assign(ep::Stream* stream, XpuVarNdarray* y, + const XpuReducedNdarray& reduced) { + size_t n = y->host_shape().HostElemNum(); + RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), stream, n, *y, reduced); + } + static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); + } +}; + +#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ + template struct NdarrayAssignCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + INSTANTIATE_NDARRAY_ASSIGN, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, + DIM_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/profiler/event.cpp b/oneflow/core/profiler/event.cpp index d56cebf..dfa0142 100644 --- a/oneflow/core/profiler/event.cpp +++ b/oneflow/core/profiler/event.cpp @@ -1,91 +1,91 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "fmt/core.h" -#include "fmt/format.h" -#include "oneflow/core/profiler/event.h" -#include "oneflow/core/profiler/util.h" - -using json = nlohmann::json; - -namespace oneflow { - -namespace profiler { -nlohmann::json IEvent::ToJson() { - return json{{"name", name_}, {"time", GetDuration()}, {"input_shapes", "-"}}; -} - -void IEvent::SetStartedAt(double t) { started_at_ = t; } - -void IEvent::SetFinishedAt(double t) { finished_at_ = t; } - -void IEvent::Start() { SetStartedAt(GetTimeNow()); } - -void IEvent::Finish() { SetFinishedAt(GetTimeNow()); } - -bool IEvent::IsChildOf(const IEvent* e) { - if (!e) { return false; } - if (this == e) { return false; } - return GetStartedAt() >= e->GetStartedAt() - && GetFinishedAt() <= e->GetFinishedAt(); -} - -const std::string& IEvent::GetName() const { return name_; } - -std::string CustomEvent::Key() { return name_; } - -nlohmann::json CustomEvent::ToJson() { - auto j = IEvent::ToJson(); - j["type"] = EventType::kCustom; - j["custom_type"] = type_; - return j; -} - -std::shared_ptr CustomEvent::Create(const std::string& name, CustomEventType type) { - return std::shared_ptr(new CustomEvent(name, type)); -} - -std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); } - -nlohmann::json KernelEvent::ToJson() { - auto j = IEvent::ToJson(); - j["type"] = EventType::kOneflowKernel; - j["input_shapes"] = GetFormatedInputShapes(); -#if defined(WITH_CUDA) || defined(WITH_ROCM) - j["memory_size"] = memory_size_; - if (!children_.empty()) { j["children"] = children_; } -#endif // WITH_CUDA - return j; -} - -std::shared_ptr KernelEvent::Create( - const std::string& name, const std::function(void)>& shape_getter) { - return std::shared_ptr(new KernelEvent(name, shape_getter)); -} - -std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) { - if (input_shapes_.size() == 0) { return "-"; } - std::vector shapes_formated(std::min(input_shapes_.size(), max_num_to_format)); - for (auto i = 0; i < shapes_formated.size(); ++i) { - const std::string current_shape = input_shapes_[i].ToString(); - shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape; - } - if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); } - return fmt::format("[{}]", fmt::join(shapes_formated, ", ")); -} - -} // namespace profiler +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "fmt/core.h" +#include "fmt/format.h" +#include "oneflow/core/profiler/event.h" +#include "oneflow/core/profiler/util.h" + +using json = nlohmann::json; + +namespace oneflow { + +namespace profiler { +nlohmann::json IEvent::ToJson() { + return json{{"name", name_}, {"time", GetDuration()}, {"input_shapes", "-"}}; +} + +void IEvent::SetStartedAt(double t) { started_at_ = t; } + +void IEvent::SetFinishedAt(double t) { finished_at_ = t; } + +void IEvent::Start() { SetStartedAt(GetTimeNow()); } + +void IEvent::Finish() { SetFinishedAt(GetTimeNow()); } + +bool IEvent::IsChildOf(const IEvent* e) { + if (!e) { return false; } + if (this == e) { return false; } + return GetStartedAt() >= e->GetStartedAt() + && GetFinishedAt() <= e->GetFinishedAt(); +} + +const std::string& IEvent::GetName() const { return name_; } + +std::string CustomEvent::Key() { return name_; } + +nlohmann::json CustomEvent::ToJson() { + auto j = IEvent::ToJson(); + j["type"] = EventType::kCustom; + j["custom_type"] = type_; + return j; +} + +std::shared_ptr CustomEvent::Create(const std::string& name, CustomEventType type) { + return std::shared_ptr(new CustomEvent(name, type)); +} + +std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); } + +nlohmann::json KernelEvent::ToJson() { + auto j = IEvent::ToJson(); + j["type"] = EventType::kOneflowKernel; + j["input_shapes"] = GetFormatedInputShapes(); +#if defined(WITH_CUDA) || defined(WITH_ROCM) + j["memory_size"] = memory_size_; + if (!children_.empty()) { j["children"] = children_; } +#endif // WITH_CUDA + return j; +} + +std::shared_ptr KernelEvent::Create( + const std::string& name, const std::function(void)>& shape_getter) { + return std::shared_ptr(new KernelEvent(name, shape_getter)); +} + +std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) { + if (input_shapes_.size() == 0) { return "-"; } + std::vector shapes_formated(std::min(input_shapes_.size(), max_num_to_format)); + for (auto i = 0; i < shapes_formated.size(); ++i) { + const std::string current_shape = input_shapes_[i].ToString(); + shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape; + } + if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); } + return fmt::format("[{}]", fmt::join(shapes_formated, ", ")); +} + +} // namespace profiler } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/profiler/event.h b/oneflow/core/profiler/event.h index af60ff5..59d8c68 100644 --- a/oneflow/core/profiler/event.h +++ b/oneflow/core/profiler/event.h @@ -1,186 +1,186 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_ -#define ONEFLOW_CORE_PROFILER_EVENT_H_ - -#include -#include -#include -#include "nlohmann/json.hpp" -#include "oneflow/core/common/util.h" -#include "oneflow/core/common/shape_view.h" - -namespace oneflow { - -namespace profiler { - -class ProfileManager; - -enum class EventType { - kCustom, // has three kinds - kOneflowKernel // OneFlow cpu/cuda kernel -}; -enum class CustomEventType { - kDefault, // for record_function - kCudaKernel, // cuda kernel - kCudaRuntime // something like cudaLaunchKernel -}; -enum class EventTimeUnit { kNS, kUS }; - -class IEvent { - public: - OF_DISALLOW_COPY_AND_MOVE(IEvent); - - IEvent() = delete; - IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {} - - virtual std::string Key() = 0; - virtual nlohmann::json ToJson(); - virtual ~IEvent() = default; - - virtual void Start(); - virtual void Finish(); - bool IsChildOf(const IEvent* e); - - const std::string& GetName() const; - template - const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const; - template - const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; - template - const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; - - protected: - virtual void SetStartedAt(double t); - virtual void SetFinishedAt(double t); - - std::string name_; - EventTimeUnit time_unit_; - double started_at_ = 0; - double finished_at_ = 0; -}; - -inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) { - if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) { - return time_ / 1000; - } - if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) { - return time_ * 1000; - } - return time_; -} - -template<> -const inline double IEvent::GetStartedAt(EventTimeUnit time_unit) const { - return ConvertTime(started_at_, time_unit_, time_unit); -} - -template<> -const inline time_t IEvent::GetStartedAt(EventTimeUnit time_unit) const { - return static_cast(GetStartedAt(time_unit)); -} - -template<> -const inline double IEvent::GetFinishedAt(EventTimeUnit time_unit) const { - return ConvertTime(finished_at_, time_unit_, time_unit); -} - -template<> -const inline time_t IEvent::GetFinishedAt(EventTimeUnit time_unit) const { - return static_cast(GetFinishedAt(time_unit)); -} - -template<> -const inline double IEvent::GetDuration(EventTimeUnit time_unit) const { - return GetFinishedAt(time_unit) - GetStartedAt(time_unit); -} - -template<> -const inline time_t IEvent::GetDuration(EventTimeUnit time_unit) const { - return static_cast(GetDuration(time_unit)); -} - -class CustomEvent final : public IEvent { - public: - friend class ProfileManager; - std::string Key() override; - - nlohmann::json ToJson() override; - - static std::shared_ptr Create(const std::string& name, - CustomEventType type = CustomEventType::kDefault); - - private: - CustomEventType type_; - CustomEvent(const std::string& custom_name, CustomEventType type) - : IEvent(custom_name, - type == CustomEventType::kDefault ? 
EventTimeUnit::kNS : EventTimeUnit::kUS), - type_(type) {} -}; - -class KernelEvent final : public IEvent { - public: - std::string Key() override; - - nlohmann::json ToJson() override; - - static std::shared_ptr Create( - const std::string& name, const std::function(void)>& shape_getter); - -#if defined(WITH_CUDA) || defined(WITH_ROCM) - void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; } - void AddChildEvent(const std::shared_ptr& e) { children_.emplace(e); } - bool AddChildEventIfSo(const std::shared_ptr& e) { - if (e->IsChildOf(dynamic_cast(this))) { - children_.emplace(e); - return true; - } - return false; - } - bool HasChildEvent(const std::shared_ptr& e) { return children_.count(e); } - void WalkAmongChildren(const std::function& e)>& f) const { - for (const auto& x : children_) { f(x); } - } -#endif // WITH_CUDA - - private: - KernelEvent(const std::string& kernel_name, - const std::function(void)>& shape_getter) - : IEvent(kernel_name, EventTimeUnit::kNS) { - if (shape_getter) { input_shapes_ = shape_getter(); } - } - -#if defined(WITH_CUDA) || defined(WITH_ROCM) - int64_t memory_size_ = -1; - std::set> children_; -#endif // WITH_CUDA - - std::vector input_shapes_; - std::string GetFormatedInputShapes(size_t max_num_to_format = 4); -}; - -} // namespace profiler -} // namespace oneflow - -namespace nlohmann { - -inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) { - j = event->ToJson(); -} - -} // namespace nlohmann - -#endif // ONEFLOW_CORE_PROFILER_EVENT_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_ +#define ONEFLOW_CORE_PROFILER_EVENT_H_ + +#include +#include +#include +#include "nlohmann/json.hpp" +#include "oneflow/core/common/util.h" +#include "oneflow/core/common/shape_view.h" + +namespace oneflow { + +namespace profiler { + +class ProfileManager; + +enum class EventType { + kCustom, // has three kinds + kOneflowKernel // OneFlow cpu/cuda kernel +}; +enum class CustomEventType { + kDefault, // for record_function + kCudaKernel, // cuda kernel + kCudaRuntime // something like cudaLaunchKernel +}; +enum class EventTimeUnit { kNS, kUS }; + +class IEvent { + public: + OF_DISALLOW_COPY_AND_MOVE(IEvent); + + IEvent() = delete; + IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {} + + virtual std::string Key() = 0; + virtual nlohmann::json ToJson(); + virtual ~IEvent() = default; + + virtual void Start(); + virtual void Finish(); + bool IsChildOf(const IEvent* e); + + const std::string& GetName() const; + template + const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const; + template + const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; + template + const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; + + protected: + virtual void SetStartedAt(double t); + virtual void SetFinishedAt(double t); + + std::string name_; + EventTimeUnit time_unit_; + double started_at_ = 0; + double finished_at_ = 0; +}; + +inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) { + if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) { + return time_ / 1000; + } + if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) { + return time_ * 1000; + } + return time_; +} + +template<> +const inline double IEvent::GetStartedAt(EventTimeUnit time_unit) const { + return ConvertTime(started_at_, time_unit_, time_unit); +} + +template<> +const inline time_t IEvent::GetStartedAt(EventTimeUnit time_unit) const { + return static_cast(GetStartedAt(time_unit)); +} + +template<> +const inline double IEvent::GetFinishedAt(EventTimeUnit time_unit) const { + return ConvertTime(finished_at_, time_unit_, time_unit); +} + +template<> +const inline time_t IEvent::GetFinishedAt(EventTimeUnit time_unit) const { + return static_cast(GetFinishedAt(time_unit)); +} + +template<> +const inline double IEvent::GetDuration(EventTimeUnit time_unit) const { + return GetFinishedAt(time_unit) - GetStartedAt(time_unit); +} + +template<> +const inline time_t IEvent::GetDuration(EventTimeUnit time_unit) const { + return static_cast(GetDuration(time_unit)); +} + +class CustomEvent final : public IEvent { + public: + friend class ProfileManager; + std::string Key() override; + + nlohmann::json ToJson() override; + + static std::shared_ptr Create(const std::string& name, + CustomEventType type = CustomEventType::kDefault); + + private: + CustomEventType type_; + CustomEvent(const std::string& custom_name, CustomEventType type) + : IEvent(custom_name, + type == CustomEventType::kDefault ? 
EventTimeUnit::kNS : EventTimeUnit::kUS), + type_(type) {} +}; + +class KernelEvent final : public IEvent { + public: + std::string Key() override; + + nlohmann::json ToJson() override; + + static std::shared_ptr Create( + const std::string& name, const std::function(void)>& shape_getter); + +#if defined(WITH_CUDA) || defined(WITH_ROCM) + void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; } + void AddChildEvent(const std::shared_ptr& e) { children_.emplace(e); } + bool AddChildEventIfSo(const std::shared_ptr& e) { + if (e->IsChildOf(dynamic_cast(this))) { + children_.emplace(e); + return true; + } + return false; + } + bool HasChildEvent(const std::shared_ptr& e) { return children_.count(e); } + void WalkAmongChildren(const std::function& e)>& f) const { + for (const auto& x : children_) { f(x); } + } +#endif // WITH_CUDA + + private: + KernelEvent(const std::string& kernel_name, + const std::function(void)>& shape_getter) + : IEvent(kernel_name, EventTimeUnit::kNS) { + if (shape_getter) { input_shapes_ = shape_getter(); } + } + +#if defined(WITH_CUDA) || defined(WITH_ROCM) + int64_t memory_size_ = -1; + std::set> children_; +#endif // WITH_CUDA + + std::vector input_shapes_; + std::string GetFormatedInputShapes(size_t max_num_to_format = 4); +}; + +} // namespace profiler +} // namespace oneflow + +namespace nlohmann { + +inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) { + j = event->ToJson(); +} + +} // namespace nlohmann + +#endif // ONEFLOW_CORE_PROFILER_EVENT_H_ diff --git a/oneflow/core/profiler/event_recorder.h b/oneflow/core/profiler/event_recorder.h index 31b1c34..6332948 100644 --- a/oneflow/core/profiler/event_recorder.h +++ b/oneflow/core/profiler/event_recorder.h @@ -1,60 +1,60 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ -#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ - -#include "oneflow/core/common/util.h" -#include "oneflow/core/profiler/event.h" - -namespace oneflow { -namespace profiler { - -class EventRecorder { - public: - using ShapeGetterFuncType = std::function(void)>; - - OF_DISALLOW_COPY_AND_MOVE(EventRecorder); - - explicit EventRecorder(const std::shared_ptr& event) : event_(event) { - CHECK_JUST(RegisterEventToProfileManager(event)); - event_->Start(); - } - - Maybe RegisterEventToProfileManager(const std::shared_ptr& event); - - ~EventRecorder() { - if (event_) { - event_->Finish(); - event_.reset(); - } - } - static std::shared_ptr CreateCustomEventRecorder(const std::string& name); - - static Maybe CreateKernelEventRecorder( - const std::string& name, -#if defined(WITH_CUDA) || defined(WITH_ROCM) - const std::function& memory_size_getter, -#endif - const ShapeGetterFuncType& shape_getter); - - private: - std::shared_ptr event_; -}; - -} // namespace profiler -} // namespace oneflow - -#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
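
The to_json overload at the end of event.h is what allows `j["children"] = children_;` in KernelEvent::ToJson to serialize a std::set of shared_ptr events directly: nlohmann::json finds the overload through argument-dependent lookup and applies it element-wise to the container. A reduced sketch of that mechanism, using a hypothetical demo::Event type rather than the profiler's classes:

#include <cstdio>
#include <memory>
#include <set>
#include <string>
#include "nlohmann/json.hpp"

namespace demo {
struct Event {
  std::string name;
  double duration_us = 0;
};
// Found via ADL (demo is an associated namespace of std::shared_ptr<demo::Event>),
// so nlohmann::json can convert containers of shared_ptr<Event> element-wise.
inline void to_json(nlohmann::json& j, const std::shared_ptr<Event>& e) {
  j = nlohmann::json{{"name", e->name}, {"time", e->duration_us}};
}
}  // namespace demo

int main() {
  auto child = std::make_shared<demo::Event>();
  child->name = "child_kernel";
  child->duration_us = 12.5;
  std::set<std::shared_ptr<demo::Event>> children{child};

  nlohmann::json j;
  j["children"] = children;  // each element goes through demo::to_json
  std::printf("%s\n", j.dump().c_str());
  return 0;
}
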
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ +#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ + +#include "oneflow/core/common/util.h" +#include "oneflow/core/profiler/event.h" + +namespace oneflow { +namespace profiler { + +class EventRecorder { + public: + using ShapeGetterFuncType = std::function(void)>; + + OF_DISALLOW_COPY_AND_MOVE(EventRecorder); + + explicit EventRecorder(const std::shared_ptr& event) : event_(event) { + CHECK_JUST(RegisterEventToProfileManager(event)); + event_->Start(); + } + + Maybe RegisterEventToProfileManager(const std::shared_ptr& event); + + ~EventRecorder() { + if (event_) { + event_->Finish(); + event_.reset(); + } + } + static std::shared_ptr CreateCustomEventRecorder(const std::string& name); + + static Maybe CreateKernelEventRecorder( + const std::string& name, +#if defined(WITH_CUDA) || defined(WITH_ROCM) + const std::function& memory_size_getter, +#endif + const ShapeGetterFuncType& shape_getter); + + private: + std::shared_ptr event_; +}; + +} // namespace profiler +} // namespace oneflow + +#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ diff --git a/oneflow/core/vm/sync_vm_mode_guard.h b/oneflow/core/vm/sync_vm_mode_guard.h index 5e63607..40e7179 100644 --- a/oneflow/core/vm/sync_vm_mode_guard.h +++ b/oneflow/core/vm/sync_vm_mode_guard.h @@ -1,39 +1,39 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ -#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ - -#include "oneflow/core/common/thread_local_guard.h" - -namespace oneflow { - -enum class SyncVmMode { - kInvalid = 0, - kEnable = 1, - kDisable = 2, -}; - -class SyncVmModeGuard final : public ThreadLocalGuard { - public: - using ThreadLocalGuard::ThreadLocalGuard; - ~SyncVmModeGuard() = default; - - static bool IsCurrentSyncVmMode() { - const auto& opt_sync_mode = Current(); - return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable; - } -}; - -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
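
EventRecorder above is a plain RAII guard: the constructor registers the event and calls Start(), the destructor calls Finish(), so a recorder placed on the stack times exactly the scope it lives in. A stripped-down, self-contained sketch of the same pattern; ScopedTimer is a hypothetical name and std::chrono stands in for the profiler's own clock.

#include <chrono>
#include <cstdio>
#include <string>
#include <utility>

// Start on construction, report on destruction, mirroring the
// Start()/Finish() pairing that EventRecorder guarantees.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), started_at_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::steady_clock::now() - started_at_)
                        .count();
    std::printf("%s took %lld us\n", name_.c_str(), static_cast<long long>(us));
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point started_at_;
};

int main() {
  ScopedTimer t("demo_scope");  // reported automatically at scope exit
  volatile double acc = 0;
  for (int i = 0; i < 1000000; ++i) { acc += i * 0.5; }
  return 0;
}
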
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ +#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ + +#include "oneflow/core/common/thread_local_guard.h" + +namespace oneflow { + +enum class SyncVmMode { + kInvalid = 0, + kEnable = 1, + kDisable = 2, +}; + +class SyncVmModeGuard final : public ThreadLocalGuard { + public: + using ThreadLocalGuard::ThreadLocalGuard; + ~SyncVmModeGuard() = default; + + static bool IsCurrentSyncVmMode() { + const auto& opt_sync_mode = Current(); + return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable; + } +}; + +} // namespace oneflow + #endif // ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ \ No newline at end of file diff --git a/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp b/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp index 326c408..b925d3e 100644 --- a/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp +++ b/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp @@ -1,296 +1,296 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/operator/operator_util.h" -#include "oneflow/user/utils/pool_util.h" - -#include -#include -#include - -namespace oneflow { - -namespace user_op { - -#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b) -#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b) - -#define START_IND_INT(a, b, c) ((a * c) / b) -#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b) - -template -__global__ void InitPtr(int elements, T* ptr) { - int gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int step = gridDim.x * blockDim.x; - while (gid < elements) { - ptr[gid] = static_cast(0); - gid += step; - } -} - -inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) { - FixedDimVector shape_3d = {GetInDim(shape, data_format, 0, dim), - GetInDim(shape, data_format, 1, dim), - GetInDim(shape, data_format, 2, dim)}; - return Shape({shape.At(0), shape.At(1), shape_3d.at(0), shape_3d.at(1), shape_3d.at(2)}); -} - -template -__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d, - int in_h, int in_w, int out_d, int out_h, int out_w) { - const int out_panel_size = out_d * out_h * out_w; - const int in_panel_size = in_d * in_h * in_w; - - CUDA_1D_KERNEL_LOOP(idx, num_elems) { - // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' - int bc_idx = idx / out_panel_size; - int out_d_idx = (idx % out_panel_size) / out_w / out_h; - int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; - int out_w_idx = (idx % out_panel_size) % (out_h * 
out_w) % out_w; - - int in_start_d = START_IND(out_d_idx, out_d, in_d); - int in_end_d = END_IND(out_d_idx, out_d, in_d); - int k_d = in_end_d - in_start_d; - - int in_start_h = START_IND(out_h_idx, out_h, in_h); - int in_end_h = END_IND(out_h_idx, out_h, in_h); - int k_h = in_end_h - in_start_h; - - int in_start_w = START_IND(out_w_idx, out_w, in_w); - int in_end_w = END_IND(out_w_idx, out_w, in_w); - int k_w = in_end_w - in_start_w; - - const T* in_ptr = - input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; - T sum = static_cast(0); - for (int id = 0; id < k_d; ++id) { - for (int ih = 0; ih < k_h; ++ih) { - for (int iw = 0; iw < k_w; ++iw) { - T val = *(in_ptr + ih * in_w + iw); - sum += val; - } - } - in_ptr += in_h * in_w; // next input depth - } - // Update output - output[idx] = sum / k_d / k_h / k_w; - } -} - -template -__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d, - int in_h, int in_w, int out_d, int out_h, int out_w) { - const int out_panel_size = out_d * out_h * out_w; - const int in_panel_size = in_d * in_h * in_w; - - CUDA_1D_KERNEL_LOOP(idx, num_elems) { - // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' - int bc_idx = idx / out_panel_size; - int out_d_idx = (idx % out_panel_size) / out_w / out_h; - int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; - int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w; - - int in_start_d = START_IND(out_d_idx, out_d, in_d); - int in_end_d = END_IND(out_d_idx, out_d, in_d); - int k_d = in_end_d - in_start_d; - - int in_start_h = START_IND(out_h_idx, out_h, in_h); - int in_end_h = END_IND(out_h_idx, out_h, in_h); - int k_h = in_end_h - in_start_h; - - int in_start_w = START_IND(out_w_idx, out_w, in_w); - int in_end_w = END_IND(out_w_idx, out_w, in_w); - int k_w = in_end_w - in_start_w; - - const T grad_delta = output[idx] / k_d / k_h / k_w; - T* input_ptr = - input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; - for (int id = 0; id < k_d; ++id) { - for (int ih = 0; ih < k_h; ++ih) { - for (int iw = 0; iw < k_w; ++iw) { - // TODO (Tianyu): Use 'atmoic::Add' when necessary - cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta); - } - } - input_ptr += in_h * in_w; // next input depth - } - } -} - -template -void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) { - const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const T* in_ptr = in_tensor->dptr(); - T* out_ptr = out_tensor->mut_dptr(); - - const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); - const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); - - // TODO (Tianyu): Support 'channels_last' - std::string data_format = "channels_first"; - const Shape& in = GetShape5D(x_shape, data_format, dim); - const Shape& out = GetShape5D(y_shape, data_format, dim); - - const int out_elems = out_tensor->shape_view().elem_cnt(); - - RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, - out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); -} - -template -void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) { - const Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - const T* out_ptr = out_tensor->dptr(); - T* in_ptr = in_tensor->mut_dptr(); - - const Shape& 
dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); - const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); - - // TODO (Tianyu): Support 'channels_last' - std::string data_format = "channels_first"; - const Shape& in = GetShape5D(dx_shape, data_format, dim); - const Shape& out = GetShape5D(dy_shape, data_format, dim); - - const int in_elems = in_tensor->shape_view().elem_cnt(); - const int out_elems = out_tensor->shape_view().elem_cnt(); - - RUN_CUDA_KERNEL((InitPtr), ctx->stream(), in_elems, in_elems, in_ptr); - RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, - out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); -} - -template -class GpuAdaptiveAvgPool1dKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool1dKernel() = default; - ~GpuAdaptiveAvgPool1dKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 1); } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool2dKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool2dKernel() = default; - ~GpuAdaptiveAvgPool2dKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 2); } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool3dKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool3dKernel() = default; - ~GpuAdaptiveAvgPool3dKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 3); } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool1dGradKernel() = default; - ~GpuAdaptiveAvgPool1dGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 1); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool2dGradKernel() = default; - ~GpuAdaptiveAvgPool2dGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 2); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool3dGradKernel() = default; - ~GpuAdaptiveAvgPool3dGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 3); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("adaptive_avg_pool1d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool3d") \ - .SetCreateFn>() \ - 
.SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int); - -#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("dx", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("dx", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int); - -} // namespace user_op - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/operator/operator_util.h" +#include "oneflow/user/utils/pool_util.h" + +#include +#include +#include + +namespace oneflow { + +namespace user_op { + +#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b) +#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b) + +#define START_IND_INT(a, b, c) ((a * c) / b) +#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b) + +template +__global__ void InitPtr(int elements, T* ptr) { + int gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int step = gridDim.x * blockDim.x; + while (gid < elements) { + ptr[gid] = static_cast(0); + gid += step; + } +} + +inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) { + FixedDimVector shape_3d = {GetInDim(shape, data_format, 0, dim), + GetInDim(shape, data_format, 1, dim), + GetInDim(shape, data_format, 2, dim)}; + return Shape({shape.At(0), shape.At(1), shape_3d.at(0), shape_3d.at(1), shape_3d.at(2)}); +} + +template +__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d, + int in_h, int in_w, int out_d, int out_h, int out_w) { + const int out_panel_size = out_d * out_h * out_w; + const int in_panel_size = in_d * in_h * in_w; + + CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' + int bc_idx = idx / out_panel_size; + int out_d_idx = (idx % out_panel_size) / out_w / out_h; + int out_h_idx = (idx % out_panel_size) % 
(out_h * out_w) / out_w; + int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w; + + int in_start_d = START_IND(out_d_idx, out_d, in_d); + int in_end_d = END_IND(out_d_idx, out_d, in_d); + int k_d = in_end_d - in_start_d; + + int in_start_h = START_IND(out_h_idx, out_h, in_h); + int in_end_h = END_IND(out_h_idx, out_h, in_h); + int k_h = in_end_h - in_start_h; + + int in_start_w = START_IND(out_w_idx, out_w, in_w); + int in_end_w = END_IND(out_w_idx, out_w, in_w); + int k_w = in_end_w - in_start_w; + + const T* in_ptr = + input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; + T sum = static_cast(0); + for (int id = 0; id < k_d; ++id) { + for (int ih = 0; ih < k_h; ++ih) { + for (int iw = 0; iw < k_w; ++iw) { + T val = *(in_ptr + ih * in_w + iw); + sum += val; + } + } + in_ptr += in_h * in_w; // next input depth + } + // Update output + output[idx] = sum / k_d / k_h / k_w; + } +} + +template +__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d, + int in_h, int in_w, int out_d, int out_h, int out_w) { + const int out_panel_size = out_d * out_h * out_w; + const int in_panel_size = in_d * in_h * in_w; + + CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' + int bc_idx = idx / out_panel_size; + int out_d_idx = (idx % out_panel_size) / out_w / out_h; + int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; + int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w; + + int in_start_d = START_IND(out_d_idx, out_d, in_d); + int in_end_d = END_IND(out_d_idx, out_d, in_d); + int k_d = in_end_d - in_start_d; + + int in_start_h = START_IND(out_h_idx, out_h, in_h); + int in_end_h = END_IND(out_h_idx, out_h, in_h); + int k_h = in_end_h - in_start_h; + + int in_start_w = START_IND(out_w_idx, out_w, in_w); + int in_end_w = END_IND(out_w_idx, out_w, in_w); + int k_w = in_end_w - in_start_w; + + const T grad_delta = output[idx] / k_d / k_h / k_w; + T* input_ptr = + input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; + for (int id = 0; id < k_d; ++id) { + for (int ih = 0; ih < k_h; ++ih) { + for (int iw = 0; iw < k_w; ++iw) { + // TODO (Tianyu): Use 'atmoic::Add' when necessary + cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta); + } + } + input_ptr += in_h * in_w; // next input depth + } + } +} + +template +void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) { + const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const T* in_ptr = in_tensor->dptr(); + T* out_ptr = out_tensor->mut_dptr(); + + const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); + const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); + + // TODO (Tianyu): Support 'channels_last' + std::string data_format = "channels_first"; + const Shape& in = GetShape5D(x_shape, data_format, dim); + const Shape& out = GetShape5D(y_shape, data_format, dim); + + const int out_elems = out_tensor->shape_view().elem_cnt(); + + RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, + out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); +} + +template +void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) { + const Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const T* out_ptr 
= out_tensor->dptr(); + T* in_ptr = in_tensor->mut_dptr(); + + const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); + const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); + + // TODO (Tianyu): Support 'channels_last' + std::string data_format = "channels_first"; + const Shape& in = GetShape5D(dx_shape, data_format, dim); + const Shape& out = GetShape5D(dy_shape, data_format, dim); + + const int in_elems = in_tensor->shape_view().elem_cnt(); + const int out_elems = out_tensor->shape_view().elem_cnt(); + + RUN_CUDA_KERNEL((InitPtr), ctx->stream(), in_elems, in_elems, in_ptr); + RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, + out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); +} + +template +class GpuAdaptiveAvgPool1dKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool1dKernel() = default; + ~GpuAdaptiveAvgPool1dKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 1); } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool2dKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool2dKernel() = default; + ~GpuAdaptiveAvgPool2dKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 2); } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool3dKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool3dKernel() = default; + ~GpuAdaptiveAvgPool3dKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 3); } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool1dGradKernel() = default; + ~GpuAdaptiveAvgPool1dGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 1); } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool2dGradKernel() = default; + ~GpuAdaptiveAvgPool2dGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 2); } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool3dGradKernel() = default; + ~GpuAdaptiveAvgPool3dGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 3); } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("adaptive_avg_pool1d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("adaptive_avg_pool2d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("y", 0) == GetDataType::value)); \ + 
REGISTER_USER_KERNEL("adaptive_avg_pool3d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int); + +#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("dx", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("dx", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int); + +} // namespace user_op + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/affine_grid_kernel.hip.cpp b/oneflow/user/kernels/affine_grid_kernel.hip.cpp index 9fe19bd..c5a445d 100644 --- a/oneflow/user/kernels/affine_grid_kernel.hip.cpp +++ b/oneflow/user/kernels/affine_grid_kernel.hip.cpp @@ -1,133 +1,133 @@ - -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/device/cuda_util.h" -#include "affine_grid_kernel.h" - -namespace oneflow { - -namespace { - -template -OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) { - if (num_steps <= 1) { return static_cast(0.0); } - - if (align_corners) { - return static_cast(-1.0 + 2.0 / (num_steps - 1) * index); - } else { - return static_cast((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1) - / num_steps); - } -} - -template -__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H, - int32_t W) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int32_t h = index / W; - const int32_t w = index % W; - const int32_t pixel_length = 3; - data_type* row_ptr = grid_ptr + h * W * pixel_length; - data_type* pixel_ptr = row_ptr + w * pixel_length; - data_type h_value = LinspaceGPU(h, H); - data_type w_value = LinspaceGPU(w, W); - - pixel_ptr[0] = w_value; - pixel_ptr[1] = h_value; - pixel_ptr[2] = static_cast(1.0); - } -} - -template -__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D, - int32_t H, int32_t W) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int32_t d = index / H; - const int32_t h = index % H; - const int32_t pixel_length = 4; - data_type* image_ptr = grid_ptr + d * H * W * pixel_length; - data_type* row_ptr = image_ptr + h * W * pixel_length; - data_type d_value = LinspaceGPU(d, D); - data_type h_value = LinspaceGPU(h, H); - - for (int32_t w = 0; w < W; ++w) { - data_type* pixel_ptr = row_ptr + w * pixel_length; - data_type w_value = LinspaceGPU(w, W); - pixel_ptr[0] = w_value; - pixel_ptr[1] = h_value; - pixel_ptr[2] = d_value; - pixel_ptr[3] = static_cast(1.0); - } - } -} - -} // namespace - -void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, - float* grid_ptr, int64_t H, int64_t W, - bool align_corners) { - int count = H * W; - if (align_corners) { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } else { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } -} -void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, - double* grid_ptr, int64_t H, int64_t W, - bool align_corners) { - int count = H * W; - if (align_corners) { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } else { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } -} - -void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, - float* grid_ptr, int64_t D, int64_t H, - int64_t W, bool align_corners) { - int count = D * H; - if (align_corners) { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } else { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } -} - -void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, - double* grid_ptr, int64_t D, int64_t H, - int64_t W, bool align_corners) { - int count = D * H; - if (align_corners) { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } else { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } -} - + +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/device/cuda_util.h" +#include "affine_grid_kernel.h" + +namespace oneflow { + +namespace { + +template +OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) { + if (num_steps <= 1) { return static_cast(0.0); } + + if (align_corners) { + return static_cast(-1.0 + 2.0 / (num_steps - 1) * index); + } else { + return static_cast((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1) + / num_steps); + } +} + +template +__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H, + int32_t W) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int32_t h = index / W; + const int32_t w = index % W; + const int32_t pixel_length = 3; + data_type* row_ptr = grid_ptr + h * W * pixel_length; + data_type* pixel_ptr = row_ptr + w * pixel_length; + data_type h_value = LinspaceGPU(h, H); + data_type w_value = LinspaceGPU(w, W); + + pixel_ptr[0] = w_value; + pixel_ptr[1] = h_value; + pixel_ptr[2] = static_cast(1.0); + } +} + +template +__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D, + int32_t H, int32_t W) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int32_t d = index / H; + const int32_t h = index % H; + const int32_t pixel_length = 4; + data_type* image_ptr = grid_ptr + d * H * W * pixel_length; + data_type* row_ptr = image_ptr + h * W * pixel_length; + data_type d_value = LinspaceGPU(d, D); + data_type h_value = LinspaceGPU(h, H); + + for (int32_t w = 0; w < W; ++w) { + data_type* pixel_ptr = row_ptr + w * pixel_length; + data_type w_value = LinspaceGPU(w, W); + pixel_ptr[0] = w_value; + pixel_ptr[1] = h_value; + pixel_ptr[2] = d_value; + pixel_ptr[3] = static_cast(1.0); + } + } +} + +} // namespace + +void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, + float* grid_ptr, int64_t H, int64_t W, + bool align_corners) { + int count = H * W; + if (align_corners) { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } else { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } +} +void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, + double* grid_ptr, int64_t H, int64_t W, + bool align_corners) { + int count = H * W; + if (align_corners) { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } else { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } +} + +void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, + float* grid_ptr, int64_t D, int64_t H, + int64_t W, bool align_corners) { + int count = D * H; + if (align_corners) { + RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } else { + 
RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } +} + +void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, + double* grid_ptr, int64_t D, int64_t H, + int64_t W, bool align_corners) { + int count = D * H; + if (align_corners) { + RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } else { + RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } +} + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/arange_kernel_util.hip.cpp b/oneflow/user/kernels/arange_kernel_util.hip.cpp index 2df4427..d7e6b59 100644 --- a/oneflow/user/kernels/arange_kernel_util.hip.cpp +++ b/oneflow/user/kernels/arange_kernel_util.hip.cpp @@ -1,48 +1,48 @@ - -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/arange_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template -__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt, - T* out) { - // Use Loop to set the value - DoArange(start, delta, arange_elem_cnt, out); -} - -template -struct ArangeFunctor final { - void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt, - T* out) { - // The thread num is set as arange_elem_cnt - RUN_CUDA_KERNEL((ArangeForwardGpuKernel), stream, arange_elem_cnt, start, delta, - arange_elem_cnt, out); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA), - ARANGE_DATA_TYPE_SEQ); -} // namespace user_op -} // namespace oneflow - + +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
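A host-side sketch of the coordinate formula in LinspaceGPU, assuming the usual grid_sample convention: with align_corners the samples sit on the endpoints of [-1, 1]; without it the second branch rescales by (num_steps - 1) / num_steps, which simplifies to cell centers at -1 + (2 * i + 1) / num_steps. NormalizedCoord is an illustrative name, not part of the kernel.

#include <cstdio>

// align_corners = true : samples on the [-1, 1] endpoints.
// align_corners = false: samples at pixel centers, -1 + (2*i + 1) / n.
double NormalizedCoord(int i, int n, bool align_corners) {
  if (n <= 1) { return 0.0; }
  const double v = -1.0 + 2.0 / (n - 1) * i;
  return align_corners ? v : v * (n - 1) / n;
}

int main() {
  for (int i = 0; i < 4; ++i) {
    printf("i=%d  corners=%+.4f  centers=%+.4f\n", i,
           NormalizedCoord(i, 4, true), NormalizedCoord(i, 4, false));
  }
  return 0;
}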
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/arange_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template +__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt, + T* out) { + // Use Loop to set the value + DoArange(start, delta, arange_elem_cnt, out); +} + +template +struct ArangeFunctor final { + void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt, + T* out) { + // The thread num is set as arange_elem_cnt + RUN_CUDA_KERNEL((ArangeForwardGpuKernel), stream, arange_elem_cnt, start, delta, + arange_elem_cnt, out); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA), + ARANGE_DATA_TYPE_SEQ); +} // namespace user_op +} // namespace oneflow + #endif // End WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/arg_sort_kernel.hip.cpp b/oneflow/user/kernels/arg_sort_kernel.hip.cpp index 46372db..1f2f276 100644 --- a/oneflow/user/kernels/arg_sort_kernel.hip.cpp +++ b/oneflow/user/kernels/arg_sort_kernel.hip.cpp @@ -1,148 +1,148 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
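The CUDA_1D_KERNEL_LOOP used throughout these kernels expands to a grid-stride loop, so any launch configuration covers all elem_cnt elements. Below is a self-contained HIP sketch of that pattern applied to arange; DoArange itself lives in arange_kernel_util.h, and the loop body here is an assumed equivalent, not the original implementation.

#include <hip/hip_runtime.h>
#include <cstdio>

// Grid-stride loop: each thread starts at its global id and strides by the
// total number of launched threads until elem_cnt is exhausted.
template<typename T>
__global__ void ArangeSketch(T start, T delta, int64_t elem_cnt, T* out) {
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < elem_cnt;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    out[i] = start + static_cast<T>(i) * delta;
  }
}

int main() {
  const int64_t n = 8;
  float* out = nullptr;
  hipMalloc(reinterpret_cast<void**>(&out), n * sizeof(float));
  ArangeSketch<float><<<1, 256>>>(2.0f, 0.5f, n, out);
  float host[8];
  hipMemcpy(host, out, n * sizeof(float), hipMemcpyDeviceToHost);
  for (int i = 0; i < 8; ++i) { printf("%.1f ", host[i]); }  // 2.0 2.5 ... 5.5
  printf("\n");
  hipFree(out);
  return 0;
}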
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape) - : capacity_{capacity}, - sorted_in_elem_cnt_{in_shape.elem_cnt()}, - indices_elem_cnt_{sorted_in_elem_cnt_} { - const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); - sorted_in_ptr_ = reinterpret_cast(ptr); - indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) - + sorted_in_aligned_bytes); - temp_storage_ptr_ = - reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); - temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - T* SortedInPtr() const { return sorted_in_ptr_; } - int32_t* IndicesPtr() const { return indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int32_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int32_t capacity_; - - T* sorted_in_ptr_; - int32_t* indices_ptr_; - void* temp_storage_ptr_; - - int64_t sorted_in_elem_cnt_; - int64_t indices_elem_cnt_; - int32_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; -} - -} // namespace - -template -class GpuArgSortKernel final : public user_op::OpKernel { - public: - GpuArgSortKernel() = default; - ~GpuArgSortKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), - tmp_buffer->mut_dptr(), in->shape_view()); - - const int32_t elem_cnt = in->shape_view().elem_cnt(); - const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int32_t instance_num = elem_cnt / instance_size; - const std::string& direction = ctx->Attr("direction"); - InitializeIndices<<stream()->As()->cuda_stream()>>>( - elem_cnt, buf_manager.IndicesPtr(), instance_size); - if (direction == "ASCENDING") { - SortPairsAscending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else if (direction == "DESCENDING") { - SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype) \ - REGISTER_USER_KERNEL("arg_sort") \ - .SetCreateFn>() \ - 
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int32_t elem_cnt = in_shape.elem_cnt(); \ - const int32_t instance_size = in_shape.dim_vec().back(); \ - const int32_t instance_num = elem_cnt / instance_size; \ - \ - /* Sorted In */ \ - const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ - /* Indices */ \ - const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t)); \ - /* CUB Temp Storage */ \ - int32_t temp_storage_bytes = -1; \ - const std::string& direction = ctx->Attr("direction"); \ - if (direction == "ASCENDING") { \ - temp_storage_bytes = \ - InferTempStorageForSortPairsAscending(instance_num, instance_size); \ - } else if (direction == "DESCENDING") { \ - temp_storage_bytes = \ - InferTempStorageForSortPairsDescending(instance_num, instance_size); \ - } else { \ - UNIMPLEMENTED(); \ - } \ - \ - return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes; \ - }); - -REGISTER_CUDA_ARG_SORT_KERNEL(float) -REGISTER_CUDA_ARG_SORT_KERNEL(double) -REGISTER_CUDA_ARG_SORT_KERNEL(bool) -REGISTER_CUDA_ARG_SORT_KERNEL(int8_t) -REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t) -REGISTER_CUDA_ARG_SORT_KERNEL(int32_t) -REGISTER_CUDA_ARG_SORT_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape) + : capacity_{capacity}, + sorted_in_elem_cnt_{in_shape.elem_cnt()}, + indices_elem_cnt_{sorted_in_elem_cnt_} { + const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); + sorted_in_ptr_ = reinterpret_cast(ptr); + indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) + + sorted_in_aligned_bytes); + temp_storage_ptr_ = + reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); + temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + T* SortedInPtr() const { return sorted_in_ptr_; } + int32_t* IndicesPtr() const { return indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int32_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int32_t capacity_; + + T* sorted_in_ptr_; + int32_t* indices_ptr_; + void* temp_storage_ptr_; + + int64_t sorted_in_elem_cnt_; + int64_t indices_elem_cnt_; + int32_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; +} + +} // namespace + +template +class GpuArgSortKernel final : public user_op::OpKernel { + public: + GpuArgSortKernel() = default; + ~GpuArgSortKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr(), in->shape_view()); + + const int32_t elem_cnt = in->shape_view().elem_cnt(); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = elem_cnt / instance_size; + const std::string& direction = ctx->Attr("direction"); + InitializeIndices<<stream()->As()->cuda_stream()>>>( + elem_cnt, buf_manager.IndicesPtr(), instance_size); + if (direction == "ASCENDING") { + SortPairsAscending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else if (direction == "DESCENDING") { + SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype) \ + REGISTER_USER_KERNEL("arg_sort") \ + .SetCreateFn>() \ + 
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int32_t elem_cnt = in_shape.elem_cnt(); \ + const int32_t instance_size = in_shape.dim_vec().back(); \ + const int32_t instance_num = elem_cnt / instance_size; \ + \ + /* Sorted In */ \ + const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ + /* Indices */ \ + const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t)); \ + /* CUB Temp Storage */ \ + int32_t temp_storage_bytes = -1; \ + const std::string& direction = ctx->Attr("direction"); \ + if (direction == "ASCENDING") { \ + temp_storage_bytes = \ + InferTempStorageForSortPairsAscending(instance_num, instance_size); \ + } else if (direction == "DESCENDING") { \ + temp_storage_bytes = \ + InferTempStorageForSortPairsDescending(instance_num, instance_size); \ + } else { \ + UNIMPLEMENTED(); \ + } \ + \ + return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes; \ + }); + +REGISTER_CUDA_ARG_SORT_KERNEL(float) +REGISTER_CUDA_ARG_SORT_KERNEL(double) +REGISTER_CUDA_ARG_SORT_KERNEL(bool) +REGISTER_CUDA_ARG_SORT_KERNEL(int8_t) +REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t) +REGISTER_CUDA_ARG_SORT_KERNEL(int32_t) +REGISTER_CUDA_ARG_SORT_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/arg_where_kernel_util.hip.cpp b/oneflow/user/kernels/arg_where_kernel_util.hip.cpp index 4157777..9d78b08 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.hip.cpp +++ b/oneflow/user/kernels/arg_where_kernel_util.hip.cpp @@ -1,142 +1,142 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
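A sketch of the tmp_buffer partition that TmpBufferManager and the SetInferTmpSizeFn lambda above agree on: a sorted copy of the input, an int32 index buffer, and the workspace hipCUB reports for the pairwise radix sort, laid out back to back with each region rounded up to an alignment boundary. The 512-byte alignment below is an assumption standing in for GetCudaAlignedSize.

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kAlign = 512;  // assumed CUDA/HIP allocation alignment

size_t AlignedSize(size_t bytes) { return (bytes + kAlign - 1) / kAlign * kAlign; }

// Mirrors the sum returned by the InferTmpSizeFn: sorted-in + indices + CUB workspace.
size_t ArgSortTmpBytes(int64_t elem_cnt, size_t sizeof_dtype, size_t cub_temp_bytes) {
  const size_t sorted_in_bytes = AlignedSize(elem_cnt * sizeof_dtype);   // SortedInPtr region
  const size_t indices_bytes = AlignedSize(elem_cnt * sizeof(int32_t));  // IndicesPtr region
  return sorted_in_bytes + indices_bytes + cub_temp_bytes;               // TempStoragePtr region
}

int main() {
  printf("%zu bytes\n", ArgSortTmpBytes(/*elem_cnt=*/1000, sizeof(float), /*cub_temp_bytes=*/4096));
  return 0;
}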
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/arg_where_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/small_vector.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -template -struct StrideIterator { - typedef StrideIterator self_type; - typedef std::ptrdiff_t difference_type; - typedef T value_type; - typedef T* pointer; - typedef T& reference; - typedef std::random_access_iterator_tag iterator_category; - - explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {} - - OF_DEVICE_FUNC reference operator[](int i) { - assert(0 <= i && i < max_iters_); - return *(ptr_ + (i * NDIM)); - } - - private: - T* ptr_; - size_t max_iters_; -}; - -template -__global__ void __launch_bounds__(kBlockSize) - CudaOffsetToNdIndexInplace(NdIndexOffsetHelper index_converter, - const T* output_size_ptr, T* output_ptr) { - CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) { - T* index_ptr = output_ptr + i * NDIM; - index_converter.OffsetToNdIndex(*index_ptr, index_ptr); - } -} - -template -struct IsTrue { - __device__ __forceinline__ bool operator()(const T& val) const { return static_cast(val); } -}; - -template -hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage, - size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter, - OUT_T* num_selected) { - IsTrue is_true; - hipcub::TransformInputIterator, const IN_T*> flag_iter(input, is_true); - hipcub::CountingInputIterator offset_counter(0); - return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter, - output_iter, num_selected, num_items, stream, false); -} - -} // namespace - -template -struct ArgWhereKernelUtil { - static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr, - void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr, - OUT_T* output_size_ptr) { - const int64_t elem_cnt = input_shape.elem_cnt(); - // deal with empty blob - if (elem_cnt == 0) { - Memset(stream, output_size_ptr, 0, sizeof(OUT_T)); - return; - } - - CHECK_NOTNULL(stream); - CHECK_LE(elem_cnt, std::numeric_limits::max()); - size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt); - CHECK_LE(workspace, temp_storage_bytes); - - if (NDIM == 1) { - OF_CUDA_CHECK((SelectTrue( - stream->As()->cuda_stream(), input_shape.elem_cnt(), temp_storage, - workspace, input_ptr, output_ptr, output_size_ptr))); - } else { - using OutputIterator = StrideIterator; - OutputIterator output_iter(output_ptr, elem_cnt); - OF_CUDA_CHECK((SelectTrue( - stream->As()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr, - output_iter, output_size_ptr))); - - OUT_T dims[NDIM] = {0}; - std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims, - [](int64_t dim) { return static_cast(dim); }); - NdIndexOffsetHelper index_converter(dims); - CudaOffsetToNdIndexInplace - <<As()->cuda_stream()>>>( - index_converter, output_size_ptr, output_ptr); - } - } - - static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) { - hipStream_t cuda_stream = stream ? 
stream->As()->cuda_stream() : 0; - size_t workspace = 0; - if (NDIM == 1) { - OF_CUDA_CHECK((SelectTrue(cuda_stream, elem_cnt, nullptr, workspace, - nullptr, nullptr, nullptr))); - } else { - using OutputIterator = StrideIterator; - OutputIterator output_iter(nullptr, elem_cnt); - OF_CUDA_CHECK((SelectTrue( - cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr))); - } - return workspace; - } -}; - -INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/arg_where_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/common/small_vector.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +template +struct StrideIterator { + typedef StrideIterator self_type; + typedef std::ptrdiff_t difference_type; + typedef T value_type; + typedef T* pointer; + typedef T& reference; + typedef std::random_access_iterator_tag iterator_category; + + explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {} + + OF_DEVICE_FUNC reference operator[](int i) { + assert(0 <= i && i < max_iters_); + return *(ptr_ + (i * NDIM)); + } + + private: + T* ptr_; + size_t max_iters_; +}; + +template +__global__ void __launch_bounds__(kBlockSize) + CudaOffsetToNdIndexInplace(NdIndexOffsetHelper index_converter, + const T* output_size_ptr, T* output_ptr) { + CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) { + T* index_ptr = output_ptr + i * NDIM; + index_converter.OffsetToNdIndex(*index_ptr, index_ptr); + } +} + +template +struct IsTrue { + __device__ __forceinline__ bool operator()(const T& val) const { return static_cast(val); } +}; + +template +hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage, + size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter, + OUT_T* num_selected) { + IsTrue is_true; + hipcub::TransformInputIterator, const IN_T*> flag_iter(input, is_true); + hipcub::CountingInputIterator offset_counter(0); + return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter, + output_iter, num_selected, num_items, stream, false); +} + +} // namespace + +template +struct ArgWhereKernelUtil { + static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr, + void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr, + OUT_T* output_size_ptr) { + const int64_t elem_cnt = input_shape.elem_cnt(); + // deal with empty blob + if (elem_cnt == 0) { + Memset(stream, output_size_ptr, 0, sizeof(OUT_T)); + return; + } + 
+ CHECK_NOTNULL(stream); + CHECK_LE(elem_cnt, std::numeric_limits::max()); + size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt); + CHECK_LE(workspace, temp_storage_bytes); + + if (NDIM == 1) { + OF_CUDA_CHECK((SelectTrue( + stream->As()->cuda_stream(), input_shape.elem_cnt(), temp_storage, + workspace, input_ptr, output_ptr, output_size_ptr))); + } else { + using OutputIterator = StrideIterator; + OutputIterator output_iter(output_ptr, elem_cnt); + OF_CUDA_CHECK((SelectTrue( + stream->As()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr, + output_iter, output_size_ptr))); + + OUT_T dims[NDIM] = {0}; + std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims, + [](int64_t dim) { return static_cast(dim); }); + NdIndexOffsetHelper index_converter(dims); + CudaOffsetToNdIndexInplace + <<As()->cuda_stream()>>>( + index_converter, output_size_ptr, output_ptr); + } + } + + static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) { + hipStream_t cuda_stream = stream ? stream->As()->cuda_stream() : 0; + size_t workspace = 0; + if (NDIM == 1) { + OF_CUDA_CHECK((SelectTrue(cuda_stream, elem_cnt, nullptr, workspace, + nullptr, nullptr, nullptr))); + } else { + using OutputIterator = StrideIterator; + OutputIterator output_iter(nullptr, elem_cnt); + OF_CUDA_CHECK((SelectTrue( + cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr))); + } + return workspace; + } +}; + +INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/argmax_kernel.hip.cpp b/oneflow/user/kernels/argmax_kernel.hip.cpp index 80b75e4..2d84443 100644 --- a/oneflow/user/kernels/argmax_kernel.hip.cpp +++ b/oneflow/user/kernels/argmax_kernel.hip.cpp @@ -1,194 +1,194 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
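A self-contained sketch of the two-pass hipcub::DeviceSelect::Flagged pattern that SelectTrue wraps: the first call with a null workspace only reports temp_storage_bytes, the second performs the selection. For brevity the flags here are a precomputed int mask rather than the TransformInputIterator over the input tensor, and the selected values are flat offsets from a counting iterator, as in the NDIM == 1 path; error checking is omitted.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 8;
  const int h_flags[n] = {0, 1, 0, 1, 1, 0, 0, 1};
  int *d_flags = nullptr, *d_out = nullptr, *d_num = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_flags), n * sizeof(int));
  hipMalloc(reinterpret_cast<void**>(&d_out), n * sizeof(int));
  hipMalloc(reinterpret_cast<void**>(&d_num), sizeof(int));
  hipMemcpy(d_flags, h_flags, n * sizeof(int), hipMemcpyHostToDevice);

  hipcub::CountingInputIterator<int> offsets(0);  // values to select: flat offsets 0..n-1
  size_t temp_bytes = 0;
  // Pass 1: null workspace, only queries the required temp storage size.
  hipcub::DeviceSelect::Flagged(nullptr, temp_bytes, offsets, d_flags, d_out, d_num, n);
  void* d_temp = nullptr;
  hipMalloc(&d_temp, temp_bytes);
  // Pass 2: the actual selection; d_num receives the number of selected items.
  hipcub::DeviceSelect::Flagged(d_temp, temp_bytes, offsets, d_flags, d_out, d_num, n);

  int h_num = 0;
  hipMemcpy(&h_num, d_num, sizeof(int), hipMemcpyDeviceToHost);
  int h_out[n] = {0};
  hipMemcpy(h_out, d_out, h_num * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < h_num; ++i) { printf("%d ", h_out[i]); }  // 1 3 4 7
  printf("\n");
  hipFree(d_temp); hipFree(d_num); hipFree(d_out); hipFree(d_flags);
  return 0;
}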
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num) - : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} { - const int32_t key_value_out_aligned_bytes = - GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair)); - key_value_out_ptr_ = reinterpret_cast*>(ptr); - temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(key_value_out_ptr_) - + key_value_out_aligned_bytes); - temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - hipcub::KeyValuePair* KeyValueOutPtr() const { return key_value_out_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int32_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int32_t capacity_; - - hipcub::KeyValuePair* key_value_out_ptr_; - void* temp_storage_ptr_; - - int32_t key_value_out_elem_cnt_; - int32_t temp_storage_bytes_; -}; - -class MultiplyFunctor final { - public: - MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} - __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { - return idx * num_col_; - } - - private: - int32_t num_col_; -}; - -template -size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = - hipcub::DeviceSegmentedReduce::ArgMax*, SegmentOffsetIter>( - /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes, - /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, /* d_end_offsets */ segment_offset_iter + 1, - /* stream */ 0); - - // auto err = - // hipcub::DeviceReduce::ArgMax*>( - // nullptr, temp_storage_bytes, - // nullptr, nullptr, num_row, - // 0); - - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr, - int32_t temp_storage_bytes, hipcub::KeyValuePair* out_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - // void * d_temp_storage = nullptr; - // hipMalloc((void **)&d_temp_storage, rt_inferred_temp_storage_bytes); - - auto err = hipcub::DeviceSegmentedReduce::ArgMax( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_in */ in_ptr, - /* d_out */ out_ptr, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* stream */ stream); - - // auto err = - // hipcub::DeviceReduce::ArgMax( - // d_temp_storage, rt_inferred_temp_storage_bytes, - // in_ptr, out_ptr, num_row, - // stream); - - OF_CUDA_CHECK(err); -} - -template 
-__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size, - const hipcub::KeyValuePair* key_value_out_ptr, - int64_t* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, instance_num) { out_ptr[i] = key_value_out_ptr[i].key % instance_size; } -} - -} // namespace - -template -class GpuArgMaxKernel final : public user_op::OpKernel { - public: - GpuArgMaxKernel() = default; - ~GpuArgMaxKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int32_t elem_cnt = in->shape_view().elem_cnt(); - const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int32_t instance_num = elem_cnt / instance_size; - TmpBufferManager buffer_manager(tmp_buffer->shape_view().elem_cnt(), - tmp_buffer->mut_dptr(), instance_num); - - ArgMax(in->dptr(), instance_num, instance_size, buffer_manager.TempStoragePtr(), - buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(), - ctx->stream()->As()->cuda_stream()); - WriteKeysToOutput<<stream()->As()->cuda_stream()>>>( - instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ARGMAX_KERNEL(dtype) \ - REGISTER_USER_KERNEL("argmax") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int32_t instance_size = in_shape.dim_vec().back(); \ - const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ - \ - /* Key-Value Out */ \ - int32_t key_value_out_bytes = \ - GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair)); \ - \ - /* CUB Temp Storage */ \ - size_t temp_storage_bytes = InferTempStorageForArgMax(instance_num, instance_size); \ - \ - return key_value_out_bytes + temp_storage_bytes; \ - }); - -REGISTER_CUDA_ARGMAX_KERNEL(float) -REGISTER_CUDA_ARGMAX_KERNEL(double) -REGISTER_CUDA_ARGMAX_KERNEL(uint8_t) -REGISTER_CUDA_ARGMAX_KERNEL(int8_t) -REGISTER_CUDA_ARGMAX_KERNEL(int32_t) -REGISTER_CUDA_ARGMAX_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num) + : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} { + const int32_t key_value_out_aligned_bytes = + GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair)); + key_value_out_ptr_ = reinterpret_cast*>(ptr); + temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(key_value_out_ptr_) + + key_value_out_aligned_bytes); + temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + hipcub::KeyValuePair* KeyValueOutPtr() const { return key_value_out_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int32_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int32_t capacity_; + + hipcub::KeyValuePair* key_value_out_ptr_; + void* temp_storage_ptr_; + + int32_t key_value_out_elem_cnt_; + int32_t temp_storage_bytes_; +}; + +class MultiplyFunctor final { + public: + MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} + __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { + return idx * num_col_; + } + + private: + int32_t num_col_; +}; + +template +size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = + hipcub::DeviceSegmentedReduce::ArgMax*, SegmentOffsetIter>( + /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes, + /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, /* d_end_offsets */ segment_offset_iter + 1, + /* stream */ 0); + + // auto err = + // hipcub::DeviceReduce::ArgMax*>( + // nullptr, temp_storage_bytes, + // nullptr, nullptr, num_row, + // 0); + + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr, + int32_t temp_storage_bytes, hipcub::KeyValuePair* out_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + // void * d_temp_storage = nullptr; + // hipMalloc((void **)&d_temp_storage, rt_inferred_temp_storage_bytes); + + auto err = hipcub::DeviceSegmentedReduce::ArgMax( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_in */ in_ptr, + /* d_out */ out_ptr, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* stream */ stream); + + // auto err = + // hipcub::DeviceReduce::ArgMax( + // d_temp_storage, rt_inferred_temp_storage_bytes, + // in_ptr, out_ptr, num_row, + // stream); + + OF_CUDA_CHECK(err); +} + +template 
+__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size, + const hipcub::KeyValuePair* key_value_out_ptr, + int64_t* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, instance_num) { out_ptr[i] = key_value_out_ptr[i].key % instance_size; } +} + +} // namespace + +template +class GpuArgMaxKernel final : public user_op::OpKernel { + public: + GpuArgMaxKernel() = default; + ~GpuArgMaxKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int32_t elem_cnt = in->shape_view().elem_cnt(); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = elem_cnt / instance_size; + TmpBufferManager buffer_manager(tmp_buffer->shape_view().elem_cnt(), + tmp_buffer->mut_dptr(), instance_num); + + ArgMax(in->dptr(), instance_num, instance_size, buffer_manager.TempStoragePtr(), + buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(), + ctx->stream()->As()->cuda_stream()); + WriteKeysToOutput<<stream()->As()->cuda_stream()>>>( + instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ARGMAX_KERNEL(dtype) \ + REGISTER_USER_KERNEL("argmax") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int32_t instance_size = in_shape.dim_vec().back(); \ + const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ + \ + /* Key-Value Out */ \ + int32_t key_value_out_bytes = \ + GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair)); \ + \ + /* CUB Temp Storage */ \ + size_t temp_storage_bytes = InferTempStorageForArgMax(instance_num, instance_size); \ + \ + return key_value_out_bytes + temp_storage_bytes; \ + }); + +REGISTER_CUDA_ARGMAX_KERNEL(float) +REGISTER_CUDA_ARGMAX_KERNEL(double) +REGISTER_CUDA_ARGMAX_KERNEL(uint8_t) +REGISTER_CUDA_ARGMAX_KERNEL(int8_t) +REGISTER_CUDA_ARGMAX_KERNEL(int32_t) +REGISTER_CUDA_ARGMAX_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/as_strided_kernel.hip.cpp b/oneflow/user/kernels/as_strided_kernel.hip.cpp index ef8972e..2448f45 100644 --- a/oneflow/user/kernels/as_strided_kernel.hip.cpp +++ b/oneflow/user/kernels/as_strided_kernel.hip.cpp @@ -1,199 +1,199 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
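A small host-side illustration of the segment-offset trick in InferTempStorageForArgMax and ArgMax above: rather than materialising a num_row + 1 offsets array, a counting iterator transformed by MultiplyFunctor yields i * num_col on the fly, so passing segment_offset_iter as d_begin_offsets and segment_offset_iter + 1 as d_end_offsets describes num_row contiguous rows.

#include <cstdio>

int main() {
  const int num_row = 3, num_col = 4;
  // What the TransformInputIterator computes for element i.
  auto offset = [&](int i) { return i * num_col; };
  for (int row = 0; row < num_row; ++row) {
    // begin = segment_offset_iter[row], end = (segment_offset_iter + 1)[row]
    printf("segment %d: [%d, %d)\n", row, offset(row), offset(row + 1));
  }
  return 0;
}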
-*/ - -#include -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/common/just.h" -#include "oneflow/core/common/util.h" -#include "oneflow/core/framework/consistency_check.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/common/nd_index_offset_helper.h" - -namespace oneflow { - -namespace { - -constexpr size_t NUM_DIM = 8; - -template -struct AsStridedParams { - NdIndexOffsetHelper destIndexOffsetHelper; - int64_t dest_dims[num_dims]; - int32_t stride[num_dims]; - int32_t dest_num_dims; - int32_t storage_offset; - int32_t input_num; - int32_t output_num; -}; - -template -__global__ void AsStrided_kernel(const T* input_buf, T* output_buf, - AsStridedParams params) { - const int64_t* dest_dims = reinterpret_cast(params.dest_dims); - const int32_t* stride = reinterpret_cast(params.stride); - - CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { - int64_t dst_index[NUM_DIM]; - params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims); - int32_t index_in_input = params.storage_offset; - FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; } - output_buf[i] = input_buf[index_in_input]; - } -} - -template -__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf, - AsStridedParams params) { - const int64_t* dest_dims = reinterpret_cast(params.dest_dims); - const int32_t* stride = reinterpret_cast(params.stride); - CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { - int64_t dy_index[NUM_DIM]; - params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims); - int32_t index_in_dx = params.storage_offset; - FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; } - cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]); - } -} - -template -struct AsStridedFunctor final { - void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims, - const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset, - const int32_t input_num, const int32_t output_num) { - NdIndexOffsetHelper destIndexOffsetHelper(dest_dims, dest_num_dims); - AsStridedParams params; - params.destIndexOffsetHelper = destIndexOffsetHelper; - FOR_RANGE(size_t, i, 0, dest_num_dims) { - params.dest_dims[i] = dest_dims[i]; - params.stride[i] = stride[i]; - } - params.dest_num_dims = dest_num_dims; - params.storage_offset = storage_offset; - params.input_num = input_num; - params.output_num = output_num; - - AsStrided_kernel - <<As()->cuda_stream()>>>(input_buf, output_buf, params); - } -}; - -template -struct AsStridedGradFunctor final { - void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims, - const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset, - const int32_t dx_num, const int32_t dy_num) { - NdIndexOffsetHelper dyIndexOffsetHelper(dy_dims, dy_num_dims); - AsStridedParams params; - params.destIndexOffsetHelper = dyIndexOffsetHelper; - FOR_RANGE(size_t, i, 0, dy_num_dims) { - params.dest_dims[i] = dy_dims[i]; - params.stride[i] = stride[i]; - } - params.dest_num_dims = dy_num_dims; - params.storage_offset = storage_offset; - params.input_num = dx_num; - params.output_num = dy_num; - - AsStridedGrad_kernel - <<As()->cuda_stream()>>>(dy_buf, dx_buf, params); - } -}; - -} // namespace - -template -class GpuAsStridedKernel 
final : public user_op::OpKernel { - public: - GpuAsStridedKernel() = default; - ~GpuAsStridedKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0); - user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0); - const auto size = ctx->Attr>("size"); - const auto stride = ctx->Attr>("stride"); - const int32_t storage_offset = ctx->Attr("storage_offset"); - - size_t dest_num_dims = output->shape_view().NumAxes(); - const int64_t* dest_dims = output->shape_view().ptr(); - const size_t input_num = input->shape_view().Count(0); - const size_t output_num = output->shape_view().Count(0); - if (input_num == 0) { - // 0-size tensor - return; - } - - AsStridedFunctor()(ctx->stream(), input->dptr(), output->mut_dptr(), dest_dims, - stride.data(), dest_num_dims, storage_offset, input_num, output_num); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAsStridedGradKernel final : public user_op::OpKernel { - public: - GpuAsStridedGradKernel() = default; - ~GpuAsStridedGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const auto size = ctx->Attr>("size"); - const auto stride = ctx->Attr>("stride"); - const int32_t storage_offset = ctx->Attr("storage_offset"); - - size_t dy_num_dims = dy->shape_view().NumAxes(); - const int64_t* dy_dims = dy->shape_view().ptr(); - const size_t dx_num = dx->shape_view().Count(0); - const size_t dy_num = dy->shape_view().Count(0); - - Memset(ctx->stream(), dx->mut_dptr(), 0, - dx->shape_view().Count(0) * sizeof(T)); - - AsStridedGradFunctor()(ctx->stream(), dy->dptr(), dx->mut_dptr(), dy_dims, - stride.data(), dy_num_dims, storage_offset, dx_num, dy_num); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_GPUASSTRIDED_KERNEL(in_type) \ - REGISTER_USER_KERNEL("as_strided") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("as_strided_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)); - -REGISTER_GPUASSTRIDED_KERNEL(half); -REGISTER_GPUASSTRIDED_KERNEL(float); -REGISTER_GPUASSTRIDED_KERNEL(double); -REGISTER_GPUASSTRIDED_KERNEL(int64_t); - -#undef REGISTER_GPUASSTRIDED_KERNEL - +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/common/just.h" +#include "oneflow/core/common/util.h" +#include "oneflow/core/framework/consistency_check.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/common/nd_index_offset_helper.h" + +namespace oneflow { + +namespace { + +constexpr size_t NUM_DIM = 8; + +template +struct AsStridedParams { + NdIndexOffsetHelper destIndexOffsetHelper; + int64_t dest_dims[num_dims]; + int32_t stride[num_dims]; + int32_t dest_num_dims; + int32_t storage_offset; + int32_t input_num; + int32_t output_num; +}; + +template +__global__ void AsStrided_kernel(const T* input_buf, T* output_buf, + AsStridedParams params) { + const int64_t* dest_dims = reinterpret_cast(params.dest_dims); + const int32_t* stride = reinterpret_cast(params.stride); + + CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { + int64_t dst_index[NUM_DIM]; + params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims); + int32_t index_in_input = params.storage_offset; + FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; } + output_buf[i] = input_buf[index_in_input]; + } +} + +template +__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf, + AsStridedParams params) { + const int64_t* dest_dims = reinterpret_cast(params.dest_dims); + const int32_t* stride = reinterpret_cast(params.stride); + CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { + int64_t dy_index[NUM_DIM]; + params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims); + int32_t index_in_dx = params.storage_offset; + FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; } + cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]); + } +} + +template +struct AsStridedFunctor final { + void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims, + const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset, + const int32_t input_num, const int32_t output_num) { + NdIndexOffsetHelper destIndexOffsetHelper(dest_dims, dest_num_dims); + AsStridedParams params; + params.destIndexOffsetHelper = destIndexOffsetHelper; + FOR_RANGE(size_t, i, 0, dest_num_dims) { + params.dest_dims[i] = dest_dims[i]; + params.stride[i] = stride[i]; + } + params.dest_num_dims = dest_num_dims; + params.storage_offset = storage_offset; + params.input_num = input_num; + params.output_num = output_num; + + AsStrided_kernel + <<As()->cuda_stream()>>>(input_buf, output_buf, params); + } +}; + +template +struct AsStridedGradFunctor final { + void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims, + const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset, + const int32_t dx_num, const int32_t dy_num) { + NdIndexOffsetHelper dyIndexOffsetHelper(dy_dims, dy_num_dims); + AsStridedParams params; + params.destIndexOffsetHelper = dyIndexOffsetHelper; + FOR_RANGE(size_t, i, 0, dy_num_dims) { + params.dest_dims[i] = dy_dims[i]; + params.stride[i] = stride[i]; + } + params.dest_num_dims = dy_num_dims; + params.storage_offset = storage_offset; + params.input_num = dx_num; + params.output_num = dy_num; + + AsStridedGrad_kernel + <<As()->cuda_stream()>>>(dy_buf, dx_buf, params); + } +}; + +} // namespace + +template +class GpuAsStridedKernel 
final : public user_op::OpKernel { + public: + GpuAsStridedKernel() = default; + ~GpuAsStridedKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0); + user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0); + const auto size = ctx->Attr>("size"); + const auto stride = ctx->Attr>("stride"); + const int32_t storage_offset = ctx->Attr("storage_offset"); + + size_t dest_num_dims = output->shape_view().NumAxes(); + const int64_t* dest_dims = output->shape_view().ptr(); + const size_t input_num = input->shape_view().Count(0); + const size_t output_num = output->shape_view().Count(0); + if (input_num == 0) { + // 0-size tensor + return; + } + + AsStridedFunctor()(ctx->stream(), input->dptr(), output->mut_dptr(), dest_dims, + stride.data(), dest_num_dims, storage_offset, input_num, output_num); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAsStridedGradKernel final : public user_op::OpKernel { + public: + GpuAsStridedGradKernel() = default; + ~GpuAsStridedGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const auto size = ctx->Attr>("size"); + const auto stride = ctx->Attr>("stride"); + const int32_t storage_offset = ctx->Attr("storage_offset"); + + size_t dy_num_dims = dy->shape_view().NumAxes(); + const int64_t* dy_dims = dy->shape_view().ptr(); + const size_t dx_num = dx->shape_view().Count(0); + const size_t dy_num = dy->shape_view().Count(0); + + Memset(ctx->stream(), dx->mut_dptr(), 0, + dx->shape_view().Count(0) * sizeof(T)); + + AsStridedGradFunctor()(ctx->stream(), dy->dptr(), dx->mut_dptr(), dy_dims, + stride.data(), dy_num_dims, storage_offset, dx_num, dy_num); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_GPUASSTRIDED_KERNEL(in_type) \ + REGISTER_USER_KERNEL("as_strided") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("as_strided_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)); + +REGISTER_GPUASSTRIDED_KERNEL(half); +REGISTER_GPUASSTRIDED_KERNEL(float); +REGISTER_GPUASSTRIDED_KERNEL(double); +REGISTER_GPUASSTRIDED_KERNEL(int64_t); + +#undef REGISTER_GPUASSTRIDED_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/assign_if_kernel.hip.cpp b/oneflow/user/kernels/assign_if_kernel.hip.cpp index 3752a71..6163c48 100644 --- a/oneflow/user/kernels/assign_if_kernel.hip.cpp +++ b/oneflow/user/kernels/assign_if_kernel.hip.cpp @@ -1,76 +1,76 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) { - if (assign_if == (*condition == 0)) { return; } - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; } -} - -template -class AssignIfGPUKernel final : public user_op::OpKernel { - public: - AssignIfGPUKernel() = default; - ~AssignIfGPUKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0); - CHECK_EQ(condition->shape_view().NumAxes(), 1); - CHECK_EQ(condition->shape_view().At(0), 1); - const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); - user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0); - if (value->dptr() == ref->dptr()) { return; } - CHECK_EQ(value->shape_view(), ref->shape_view()); - CHECK_EQ(value->data_type(), ref->data_type()); - const size_t elem_cnt = ref->shape_view().elem_cnt(); - AssignGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, condition->dptr(), value->dptr(), ref->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } -}; - -} // namespace - -#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \ - value_type) \ - REGISTER_USER_KERNEL(op_type_name) \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("condition", 0) == GetDataType::value) \ - && (user_op::HobDataType("value", 0) == GetDataType::value)); - -#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type) \ - REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ - "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \ - REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ - "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ, - POD_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) { + if (assign_if == (*condition == 0)) { return; } + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; } +} + +template +class AssignIfGPUKernel final : public user_op::OpKernel { + public: + AssignIfGPUKernel() = default; + ~AssignIfGPUKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0); + CHECK_EQ(condition->shape_view().NumAxes(), 1); + CHECK_EQ(condition->shape_view().At(0), 1); + const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); + user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0); + if (value->dptr() == ref->dptr()) { return; } + CHECK_EQ(value->shape_view(), ref->shape_view()); + CHECK_EQ(value->data_type(), ref->data_type()); + const size_t elem_cnt = ref->shape_view().elem_cnt(); + AssignGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, condition->dptr(), value->dptr(), ref->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } +}; + +} // namespace + +#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \ + value_type) \ + REGISTER_USER_KERNEL(op_type_name) \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("condition", 0) == GetDataType::value) \ + && (user_op::HobDataType("value", 0) == GetDataType::value)); + +#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type) \ + REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ + "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \ + REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ + "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ, + POD_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/avg_pool_kernel.hip.cpp b/oneflow/user/kernels/avg_pool_kernel.hip.cpp index e2abbff..a8777ba 100644 --- a/oneflow/user/kernels/avg_pool_kernel.hip.cpp +++ b/oneflow/user/kernels/avg_pool_kernel.hip.cpp @@ -1,200 +1,200 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/avg_pool_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetMinThreadNum(const int64_t elem_num) { return std::min(elem_num, kBlockSize); } - -int GetNumBlocks(int32_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -} // namespace - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int32_t padding_l, const int32_t n_batch, - const int32_t n_channel, const int32_t x_length, - const int32_t kernel_size_l, const int32_t stride_l, - const bool count_include_pad, const int32_t divisor_override) { - Avgpool1dForwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, - x_length, kernel_size_l, stride_l, count_include_pad, - divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, const int32_t padding_h, - const int32_t padding_w, const int32_t n_batch, - const int32_t n_channel, const int32_t x_height, - const int32_t x_width, const int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_h, - const int32_t stride_w, const bool count_include_pad, - const int32_t divisor_override) { - Avgpool2dForwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, - n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h, - stride_w, count_include_pad, divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int32_t padding_t, const int32_t padding_h, - const int32_t padding_w, const int32_t n_batch, - const int32_t n_channel, const int32_t x_time, - const int32_t x_height, const int32_t x_width, - const int32_t kernel_size_t, int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_t, - const int32_t stride_h, const int32_t stride_w, - const bool count_include_pad, const int32_t divisor_override) { - Avgpool3dForwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, - n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, - kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, - count_include_pad, divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, const int32_t padding_l, - const int32_t n_batch, const int32_t n_channel, - const int32_t input_length, const int32_t kernel_size_l, - const int32_t stride_l, const bool count_include_pad, - const int32_t divisor_override) { - Avgpool1dBackwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, - input_length, kernel_size_l, stride_l, count_include_pad, - divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, const int32_t padding_h, - const int32_t padding_w, const int32_t n_batch, - const int32_t n_channel, const int32_t input_height, - const int32_t input_width, 
const int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_h, - const int32_t stride_w, const bool count_include_pad, - int32_t divisor_override) { - Avgpool2dBackwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, - n_channel, input_height, input_width, kernel_size_h, kernel_size_w, - stride_h, stride_w, count_include_pad, divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward( - const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, - const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, - const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h, - const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) { - Avgpool3dBackwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, - n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, - kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, - count_include_pad, divisor_override); -}; - -template -struct AvgPoolKernelUtil { - static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool1dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], - params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); - } - - static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool1dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], - params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); - } - - static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool2dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], - params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), - params_3d.divisor_override()); - } - - static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool2dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], - params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), - params_3d.divisor_override()); - } - - static void Avgpool3dForward(ep::Stream* stream, const 
NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool3dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), - params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.count_include_pad(), params_3d.divisor_override()); - } - - static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool3dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), - params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.count_include_pad(), params_3d.divisor_override()); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA), - AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/avg_pool_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetMinThreadNum(const int64_t elem_num) { return std::min(elem_num, kBlockSize); } + +int GetNumBlocks(int32_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +} // namespace + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int32_t padding_l, const int32_t n_batch, + const int32_t n_channel, const int32_t x_length, + const int32_t kernel_size_l, const int32_t stride_l, + const bool count_include_pad, const int32_t divisor_override) { + Avgpool1dForwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, + x_length, kernel_size_l, stride_l, count_include_pad, + divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, const int32_t padding_h, + const int32_t padding_w, const int32_t n_batch, + const int32_t n_channel, const int32_t x_height, + const int32_t x_width, const int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_h, + const int32_t stride_w, const bool count_include_pad, + const int32_t divisor_override) { + Avgpool2dForwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, + n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h, + stride_w, count_include_pad, divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int32_t padding_t, const int32_t padding_h, + const int32_t padding_w, const int32_t n_batch, + const int32_t n_channel, const int32_t x_time, + const int32_t x_height, const int32_t x_width, + const int32_t kernel_size_t, int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_t, + const int32_t stride_h, const int32_t stride_w, + const bool count_include_pad, const int32_t divisor_override) { + Avgpool3dForwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, + n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, + kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, + count_include_pad, divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, const int32_t padding_l, + const int32_t n_batch, const int32_t n_channel, + const int32_t input_length, const int32_t kernel_size_l, + const int32_t stride_l, const bool count_include_pad, + const int32_t divisor_override) { + Avgpool1dBackwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, + input_length, kernel_size_l, stride_l, count_include_pad, + divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, const int32_t padding_h, + const int32_t padding_w, const int32_t n_batch, + const int32_t n_channel, const int32_t input_height, + const int32_t input_width, 
const int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_h, + const int32_t stride_w, const bool count_include_pad, + int32_t divisor_override) { + Avgpool2dBackwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, + n_channel, input_height, input_width, kernel_size_h, kernel_size_w, + stride_h, stride_w, count_include_pad, divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward( + const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, + const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, + const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, + const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h, + const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) { + Avgpool3dBackwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, + n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, + kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, + count_include_pad, divisor_override); +}; + +template +struct AvgPoolKernelUtil { + static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool1dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], + params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); + } + + static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool1dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], + params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); + } + + static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool2dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], + params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), + params_3d.divisor_override()); + } + + static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool2dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], + params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), + params_3d.divisor_override()); + } + + static void Avgpool3dForward(ep::Stream* stream, const 
NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool3dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.count_include_pad(), params_3d.divisor_override()); + } + + static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool3dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.count_include_pad(), params_3d.divisor_override()); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA), + AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp b/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp index 547fea4..de9cf92 100644 --- a/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp +++ b/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp @@ -1,103 +1,103 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/batch_gather_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices, - const int64_t indices_num, const int64_t instance_size, - const int64_t gather_dim_size) { - const int64_t batch_idx = out_offset / (indices_num * instance_size); - const int64_t indices_idx = out_offset % (indices_num * instance_size) / instance_size; - const int64_t inner_idx = out_offset % instance_size; - const int64_t idx = indices[batch_idx * indices_num + indices_idx]; - assert(idx >= 0 && idx < gather_dim_size); - return batch_idx * gather_dim_size * instance_size + idx * instance_size + inner_idx; -} - -template -__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices, - const int64_t indices_num, const int64_t instance_size, - const int64_t gather_dim_size, T* out) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - out[i] = in[GetInOffset(i, indices, indices_num, instance_size, gather_dim_size)]; - } -} - -template -__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices, - const int64_t indices_num, const int64_t instance_size, - const int64_t gather_dim_size, T* in_diff) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - cuda::atomic::Add( - in_diff + GetInOffset(i, indices, indices_num, instance_size, gather_dim_size), - out_diff[i]); - } -} - -} // namespace - -template -struct BatchGatherKernelUtilImpl final { - static void Forward(ep::Stream* stream, const T* in, const K* indices, - const Shape& flat_out_shape, const int64_t gather_dim_size, T* out); - static void Backward(ep::Stream* stream, const T* out_diff, const K* indices, - const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff); -}; - -template -void BatchGatherKernelUtilImpl::Forward(ep::Stream* stream, const T* in, - const K* indices, - const Shape& flat_out_shape, - const int64_t gather_dim_size, - T* out) { - const int64_t batch_num = flat_out_shape.At(0); - const int64_t indices_num = flat_out_shape.At(1); - const int64_t instance_size = flat_out_shape.At(2); - const int64_t elem_cnt = batch_num * indices_num * instance_size; - BatchGatherForwardGpu<<As()->cuda_stream()>>>( - elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out); -} - -template -void BatchGatherKernelUtilImpl::Backward( - ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape, - const int64_t gather_dim_size, T* in_diff) { - const int64_t batch_num = flat_out_diff_shape.At(0); - const int64_t indices_num = flat_out_diff_shape.At(1); - const int64_t instance_size = flat_out_diff_shape.At(2); - const int64_t elem_cnt = batch_num * indices_num * instance_size; - BatchGatherBackwardGpu<<As()->cuda_stream()>>>( - elem_cnt, out_diff, indices, indices_num, instance_size, gather_dim_size, in_diff); -} - -#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair) \ - template struct BatchGatherKernelUtilImpl; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA, - FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/batch_gather_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices, + const int64_t indices_num, const int64_t instance_size, + const int64_t gather_dim_size) { + const int64_t batch_idx = out_offset / (indices_num * instance_size); + const int64_t indices_idx = out_offset % (indices_num * instance_size) / instance_size; + const int64_t inner_idx = out_offset % instance_size; + const int64_t idx = indices[batch_idx * indices_num + indices_idx]; + assert(idx >= 0 && idx < gather_dim_size); + return batch_idx * gather_dim_size * instance_size + idx * instance_size + inner_idx; +} + +template +__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices, + const int64_t indices_num, const int64_t instance_size, + const int64_t gather_dim_size, T* out) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + out[i] = in[GetInOffset(i, indices, indices_num, instance_size, gather_dim_size)]; + } +} + +template +__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices, + const int64_t indices_num, const int64_t instance_size, + const int64_t gather_dim_size, T* in_diff) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + cuda::atomic::Add( + in_diff + GetInOffset(i, indices, indices_num, instance_size, gather_dim_size), + out_diff[i]); + } +} + +} // namespace + +template +struct BatchGatherKernelUtilImpl final { + static void Forward(ep::Stream* stream, const T* in, const K* indices, + const Shape& flat_out_shape, const int64_t gather_dim_size, T* out); + static void Backward(ep::Stream* stream, const T* out_diff, const K* indices, + const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff); +}; + +template +void BatchGatherKernelUtilImpl::Forward(ep::Stream* stream, const T* in, + const K* indices, + const Shape& flat_out_shape, + const int64_t gather_dim_size, + T* out) { + const int64_t batch_num = flat_out_shape.At(0); + const int64_t indices_num = flat_out_shape.At(1); + const int64_t instance_size = flat_out_shape.At(2); + const int64_t elem_cnt = batch_num * indices_num * instance_size; + BatchGatherForwardGpu<<As()->cuda_stream()>>>( + elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out); +} + +template +void BatchGatherKernelUtilImpl::Backward( + ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape, + const int64_t gather_dim_size, T* in_diff) { + const int64_t batch_num = flat_out_diff_shape.At(0); + const int64_t indices_num = flat_out_diff_shape.At(1); + const int64_t instance_size = flat_out_diff_shape.At(2); + const int64_t elem_cnt = batch_num * indices_num * instance_size; + BatchGatherBackwardGpu<<As()->cuda_stream()>>>( + elem_cnt, out_diff, indices, indices_num, instance_size, 
gather_dim_size, in_diff); +} + +#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair) \ + template struct BatchGatherKernelUtilImpl; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA, + FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp index c1fe0cd..ccceaad 100644 --- a/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp @@ -1,204 +1,204 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -using namespace loss; - -template -struct BinaryCrossEntropyFunctor { - T zero_; - T one_; - T negative_hundred_; - BinaryCrossEntropyFunctor() - : zero_(GetZeroVal()), one_(GetOneVal()), negative_hundred_(static_cast(-100)) {} - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - assert(input_val >= zero_); - assert(input_val <= one_); - return (target_val - one_) * max(static_cast(log(one_ - input_val)), negative_hundred_) - - target_val * max(static_cast(log(input_val)), negative_hundred_); - } - - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { - return (*this)(input_val, target_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyFunctor { - float zero_; - float one_; - float negative_hundred_; - BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {} - __device__ __forceinline__ float operator()(float input_val, float target_val) const { - assert(input_val >= zero_); - assert(input_val <= one_); - return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_) - - target_val * max(logf(input_val), negative_hundred_); - } - - __device__ __forceinline__ float operator()(float input_val, float target_val, - float weight_val) const { - return (*this)(input_val, target_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyFunctor { - BinaryCrossEntropyFunctor float_functor; - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - return __float2half(float_functor(__half2float(input_val), __half2float(target_val))); - } - - __device__ __forceinline__ half operator()(half input_val, half target_val, - half weight_val) const { - return (*this)(input_val, target_val) * weight_val; - } -}; - -template -struct BinaryCrossEntropyGradFunctor { - T eps_; - T one_; - BinaryCrossEntropyGradFunctor() : eps_(static_cast(1e-12)), one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T 
target_val, T dy_val) const { - return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_); - } - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { - return (*this)(input_val, target_val, dy_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyGradFunctor { - BinaryCrossEntropyGradFunctor float_functor; - BinaryCrossEntropyGradFunctor() {} - __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { - return __float2half( - float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val))); - } - __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val, - half weight_val) const { - return __float2half(float_functor(__half2float(input_val), __half2float(target_val), - __half2float(dy_val), __half2float(weight_val))); - } -}; - -template -class BinaryCrossEntropyKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyKernel() = default; - ~BinaryCrossEntropyKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - OF_CUDA_CHECK( - (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, - weight, ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK( - (cuda::elementwise::Binary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class BinaryCrossEntropyGradKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyGradKernel() = default; - ~BinaryCrossEntropyGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* dy = dy_blob->dptr(); - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyGradFunctor(), elem_cnt, dx, input, target, dy, - ctx->stream()->As()->cuda_stream()))); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define 
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) - -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +using namespace loss; + +template +struct BinaryCrossEntropyFunctor { + T zero_; + T one_; + T negative_hundred_; + BinaryCrossEntropyFunctor() + : zero_(GetZeroVal()), one_(GetOneVal()), negative_hundred_(static_cast(-100)) {} + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + assert(input_val >= zero_); + assert(input_val <= one_); + return (target_val - one_) * max(static_cast(log(one_ - input_val)), negative_hundred_) + - target_val * max(static_cast(log(input_val)), negative_hundred_); + } + + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { + return (*this)(input_val, target_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyFunctor { + float zero_; + float one_; + float negative_hundred_; + BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {} + __device__ __forceinline__ float operator()(float input_val, float target_val) const { + assert(input_val >= zero_); + assert(input_val <= one_); + return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_) + - target_val * max(logf(input_val), negative_hundred_); + } + + __device__ __forceinline__ float operator()(float input_val, float target_val, + float weight_val) const { + return (*this)(input_val, target_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyFunctor { + BinaryCrossEntropyFunctor float_functor; + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + return 
__float2half(float_functor(__half2float(input_val), __half2float(target_val))); + } + + __device__ __forceinline__ half operator()(half input_val, half target_val, + half weight_val) const { + return (*this)(input_val, target_val) * weight_val; + } +}; + +template +struct BinaryCrossEntropyGradFunctor { + T eps_; + T one_; + BinaryCrossEntropyGradFunctor() : eps_(static_cast(1e-12)), one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { + return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_); + } + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { + return (*this)(input_val, target_val, dy_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyGradFunctor { + BinaryCrossEntropyGradFunctor float_functor; + BinaryCrossEntropyGradFunctor() {} + __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { + return __float2half( + float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val))); + } + __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val, + half weight_val) const { + return __float2half(float_functor(__half2float(input_val), __half2float(target_val), + __half2float(dy_val), __half2float(weight_val))); + } +}; + +template +class BinaryCrossEntropyKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyKernel() = default; + ~BinaryCrossEntropyKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* out = out_blob->mut_dptr(); + + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + OF_CUDA_CHECK( + (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, + weight, ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK( + (cuda::elementwise::Binary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class BinaryCrossEntropyGradKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyGradKernel() = default; + ~BinaryCrossEntropyGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* dy = dy_blob->dptr(); + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* dx = dx_blob->mut_dptr(); + + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + 
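+      // Weighted branch: GenericLauncher dispatches a four-input elementwise kernel, so each
+      // element evaluates dx = weight * dy * (input - target) / max((1 - input) * input, eps)
+      // with eps = 1e-12, i.e. BinaryCrossEntropyGradFunctor above; the eps clamp guards
+      // against division by zero when input saturates at 0 or 1.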
OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyGradFunctor(), elem_cnt, dx, input, target, dy, + ctx->stream()->As()->cuda_stream()))); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) + +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp index fc19e37..9a7d7c4 100644 --- a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp @@ -1,373 +1,373 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -using namespace loss; - -enum class WeightType { - kNone, - kWeight, - kPosWeight, - kBoth, -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - T zero_; - T one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - const T max_val = -input_val < zero_ ? 
zero_ : -input_val; - return (one_ - target_val) * input_val + max_val - + (log(exp(-max_val) + exp(-input_val - max_val))); - } -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - T zero_; - T one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { - const T max_val = -input_val < zero_ ? zero_ : -input_val; - const T pos_weight_processed_val = weight_val - target_val + one_; - return (one_ - target_val) * input_val - + (pos_weight_processed_val - * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val)); - } -}; - -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - float zero_; - float one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} - __device__ __forceinline__ float operator()(float input_val, float target_val) const { - const float max_val = -input_val < zero_ ? zero_ : -input_val; - return (one_ - target_val) * input_val + max_val - + (logf(expf(-max_val) + expf(-input_val - max_val))); - } -}; - -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - float zero_; - float one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} - __device__ __forceinline__ float operator()(float input_val, float target_val, - float weight_val) const { - const float max_val = -input_val < zero_ ? zero_ : -input_val; - const float pos_weight_processed_val = weight_val - target_val + one_; - return (one_ - target_val) * input_val - + (pos_weight_processed_val - * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val)); - } -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { - return f(input_val, target_val) * weight_val; - } -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val, - T pos_weight_val) const { - return f(input_val, target_val, pos_weight_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - return __float2half(f(__half2float(input_val), __half2float(target_val))); - } -}; -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val, - half weight_val) const { - return __float2half( - f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); - } -}; -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val, - half weight_val) const { - return __float2half( - f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); - } -}; -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val, - half pos_weight_val) const { - return __float2half(f(__half2float(input_val), __half2float(target_val), - __half2float(weight_val), __half2float(pos_weight_val))); - } -}; - -template -__device__ __forceinline__ T CalSigmoid(const T x) { - const T half_of_one = static_cast(0.5); - 
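-  // Computes the logistic sigmoid via the identity sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5,
-  // which follows from tanh(x / 2) = (1 - exp(-x)) / (1 + exp(-x)); the float specialization
-  // below uses tanhf, and the half specialization converts through float with
-  // __half2float / __float2half.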
return half_of_one * tanh(half_of_one * x) + half_of_one; -} - -template<> -__device__ __forceinline__ float CalSigmoid(const float x) { - const float half_of_one = static_cast(0.5); - return half_of_one * tanhf(half_of_one * x) + half_of_one; -} - -template<> -__device__ __forceinline__ half CalSigmoid(const half x) { - return __float2half(CalSigmoid(__half2float(x))); -} - -template -struct BinaryCrossEntropyWithLogitsGradFunctor; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { - return (CalSigmoid(input_val) - target_val) * dy_val; - } -}; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - T one_; - BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { - return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val); - } -}; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - BinaryCrossEntropyWithLogitsGradFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { - return f(input_val, target_val, dy_val) * weight_val; - } -}; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - BinaryCrossEntropyWithLogitsGradFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val, - T pos_weight_val) const { - return f(input_val, target_val, dy_val, pos_weight_val) * weight_val; - } -}; - -template -class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyWithLogitsKernel() = default; - ~BinaryCrossEntropyWithLogitsKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - - if (ctx->Attr("has_pos_weight")) { - T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); - const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - - Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); - pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); - NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), - XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape_view(), target)); - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyWithLogitsFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed, - ctx->stream()->As()->cuda_stream()))); - - } else { - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, - target, pos_weight_processed, ctx->stream()->As()->cuda_stream()))); - } - } else { - if 
(ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, - target, weight, ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Binary( - BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, - target, ctx->stream()->As()->cuda_stream()))); - } - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyWithLogitsGradKernel() = default; - ~BinaryCrossEntropyWithLogitsGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* dy = dy_blob->dptr(); - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - - if (ctx->Attr("has_pos_weight")) { - T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); - const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - - Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); - pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); - NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), - XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape_view(), target)); - - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed, - ctx->stream()->As()->cuda_stream()))); - - } else { - using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed, - ctx->stream()->As()->cuda_stream()))); - } - } else { - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyWithLogitsGradFunctor(), elem_cnt, dx, input, - target, dy, ctx->stream()->As()->cuda_stream()))); - } - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -user_op::InferTmpSizeFn GenFwInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - const int64_t n = ctx->InputShape("input", 0).elem_cnt(); - size_t 
tmp_buffer_size = 0; - if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } - return tmp_buffer_size; - }; -} -template -user_op::InferTmpSizeFn GenBwInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - const int64_t n = ctx->InputShape("target", 0).elem_cnt(); - size_t tmp_buffer_size = 0; - if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } - return tmp_buffer_size; - }; -} - -} // namespace - -#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(GenFwInferTmpSizeFn()); - -#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(GenBwInferTmpSizeFn()); - -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) - -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +using namespace loss; + +enum class WeightType { + kNone, + kWeight, + kPosWeight, + kBoth, +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + T zero_; + T one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + const T max_val = -input_val < zero_ ? 
zero_ : -input_val; + return (one_ - target_val) * input_val + max_val + + (log(exp(-max_val) + exp(-input_val - max_val))); + } +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + T zero_; + T one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { + const T max_val = -input_val < zero_ ? zero_ : -input_val; + const T pos_weight_processed_val = weight_val - target_val + one_; + return (one_ - target_val) * input_val + + (pos_weight_processed_val + * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val)); + } +}; + +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + float zero_; + float one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} + __device__ __forceinline__ float operator()(float input_val, float target_val) const { + const float max_val = -input_val < zero_ ? zero_ : -input_val; + return (one_ - target_val) * input_val + max_val + + (logf(expf(-max_val) + expf(-input_val - max_val))); + } +}; + +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + float zero_; + float one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} + __device__ __forceinline__ float operator()(float input_val, float target_val, + float weight_val) const { + const float max_val = -input_val < zero_ ? zero_ : -input_val; + const float pos_weight_processed_val = weight_val - target_val + one_; + return (one_ - target_val) * input_val + + (pos_weight_processed_val + * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val)); + } +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { + return f(input_val, target_val) * weight_val; + } +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val, + T pos_weight_val) const { + return f(input_val, target_val, pos_weight_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + return __float2half(f(__half2float(input_val), __half2float(target_val))); + } +}; +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val, + half weight_val) const { + return __float2half( + f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); + } +}; +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val, + half weight_val) const { + return __float2half( + f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); + } +}; +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val, + half pos_weight_val) const { + return __float2half(f(__half2float(input_val), __half2float(target_val), + __half2float(weight_val), __half2float(pos_weight_val))); + } +}; + +template +__device__ __forceinline__ T CalSigmoid(const T x) { + const T half_of_one = static_cast(0.5); + 
return half_of_one * tanh(half_of_one * x) + half_of_one; +} + +template<> +__device__ __forceinline__ float CalSigmoid(const float x) { + const float half_of_one = static_cast(0.5); + return half_of_one * tanhf(half_of_one * x) + half_of_one; +} + +template<> +__device__ __forceinline__ half CalSigmoid(const half x) { + return __float2half(CalSigmoid(__half2float(x))); +} + +template +struct BinaryCrossEntropyWithLogitsGradFunctor; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { + return (CalSigmoid(input_val) - target_val) * dy_val; + } +}; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + T one_; + BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { + return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val); + } +}; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + BinaryCrossEntropyWithLogitsGradFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { + return f(input_val, target_val, dy_val) * weight_val; + } +}; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + BinaryCrossEntropyWithLogitsGradFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val, + T pos_weight_val) const { + return f(input_val, target_val, dy_val, pos_weight_val) * weight_val; + } +}; + +template +class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyWithLogitsKernel() = default; + ~BinaryCrossEntropyWithLogitsKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* out = out_blob->mut_dptr(); + + if (ctx->Attr("has_pos_weight")) { + T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); + const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); + + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); + pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); + NdarrayUtil::BroadcastMul( + ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), + XpuVarNdarray(pos_weight_shape, pos_weight), + XpuVarNdarray(target_blob->shape_view(), target)); + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyWithLogitsFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed, + ctx->stream()->As()->cuda_stream()))); + + } else { + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, + target, pos_weight_processed, ctx->stream()->As()->cuda_stream()))); + } + } else { + if 
(ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, + target, weight, ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Binary( + BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, + target, ctx->stream()->As()->cuda_stream()))); + } + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyWithLogitsGradKernel() = default; + ~BinaryCrossEntropyWithLogitsGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* dy = dy_blob->dptr(); + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* dx = dx_blob->mut_dptr(); + + if (ctx->Attr("has_pos_weight")) { + T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); + const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); + + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); + pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); + NdarrayUtil::BroadcastMul( + ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), + XpuVarNdarray(pos_weight_shape, pos_weight), + XpuVarNdarray(target_blob->shape_view(), target)); + + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed, + ctx->stream()->As()->cuda_stream()))); + + } else { + using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed, + ctx->stream()->As()->cuda_stream()))); + } + } else { + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyWithLogitsGradFunctor(), elem_cnt, dx, input, + target, dy, ctx->stream()->As()->cuda_stream()))); + } + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +user_op::InferTmpSizeFn GenFwInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + const int64_t n = ctx->InputShape("input", 0).elem_cnt(); + size_t 
tmp_buffer_size = 0; + if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } + return tmp_buffer_size; + }; +} +template +user_op::InferTmpSizeFn GenBwInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + const int64_t n = ctx->InputShape("target", 0).elem_cnt(); + size_t tmp_buffer_size = 0; + if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } + return tmp_buffer_size; + }; +} + +} // namespace + +#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(GenFwInferTmpSizeFn()); + +#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(GenBwInferTmpSizeFn()); + +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) + +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp b/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp index 2de0821..79ca507 100644 --- a/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp @@ -1,277 +1,277 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include -#include "oneflow/core/kernel/cuda_graph_support.h" - -namespace oneflow { - -namespace user_op { - -namespace { - -constexpr int32_t kBlockSize = 1024; -constexpr int32_t kReduceLocalSumBlockSize = 1024; -constexpr int32_t kSingleBlockProcessNumThreshold = 1024; - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -template -inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, - int64_t max_blocks, int64_t waves, int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int max_active_blocks; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, - block_size, dynamic_smem_size); - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); - return hipSuccess; -} - -template -__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target, - Out* out, - const int32_t local_elem_cnt, - const int32_t reduce_elem_cnt) { - ComputeType zero = static_cast(0.0); - ComputeType one = static_cast(1.0); - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - ComputeType reduce_sum = 0.0; - CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) { - const ComputeType input_val = static_cast(input[i]); - const ComputeType target_val = static_cast(target[i]); - const ComputeType max_val = -input_val < zero ? 
zero : -input_val; - const ComputeType result = - (one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val))); - reduce_sum += result; - } - - const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); - if (threadIdx.x == 0) { out[blockIdx.x] = static_cast(block_reduce_sum / reduce_elem_cnt); } -} - -template -__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) { - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - ComputeType reduce_sum = 0.0; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; } - const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); - if (threadIdx.x == 0) { out[0] = static_cast(block_reduce_sum); } -} - -template -__device__ __forceinline__ T Sigmoid(const T x) { - const T half_of_one = static_cast(0.5); - return half_of_one * tanh(half_of_one * x) + half_of_one; -} - -template<> -__device__ __forceinline__ half Sigmoid(const half x) { - return __float2half(Sigmoid(__half2float(x))); -} - -template -struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor { - OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor( - const T elem_cnt_reciprocal, const T dy) - : elem_cnt_reciprocal(elem_cnt_reciprocal), dy(dy) {} - __device__ T operator()(const T input_val, const T target_val) const { - return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal; - } - const T dy; - const T elem_cnt_reciprocal; -}; - -template -struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor { - OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor( - const int32_t elem_cnt, const T* dy_ptr) - : elem_cnt_reciprocal(1.0f / elem_cnt), dy_ptr(dy_ptr) {} - __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor operator()() const { - return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(elem_cnt_reciprocal, - *dy_ptr); - } - const T* dy_ptr; - const T elem_cnt_reciprocal; -}; - -template -class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel, - public CudaGraphSupport { - public: - BinaryCrossEntropyWithLogitsMeanKernel() = default; - ~BinaryCrossEntropyWithLogitsMeanKernel() override = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateBCEWithLogitsReduceMeanKernelCache(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - - int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); - int64_t reduce_elem_cnt = local_elem_cnt; - - if (cache != nullptr) { - // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
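      // Sketch (not part of the original source): the fused kernel launched below accumulates the
      // numerically stable BCE-with-logits value (1 - t) * x + max(-x, 0) + log(exp(-max) + exp(-x - max)),
      // which is algebraically equal to -t * log(sigmoid(x)) - (1 - t) * log(1 - sigmoid(x)) but does
      // not overflow for large |x|. A minimal host-side reference, assuming <cmath> is available:
      //   auto stable_bce = [](double x, double t) {
      //     const double m = x < 0.0 ? -x : 0.0;  // max(-x, 0)
      //     return (1.0 - t) * x + m + std::log(std::exp(-m) + std::exp(-x - m));
      //   };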
- const auto* bce_cache = dynamic_cast(cache); - CHECK_NOTNULL(bce_cache); - reduce_elem_cnt = bce_cache->reduce_elem_cnt(); - } - - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - using ComputeType = typename DefaultComputeType::type; - - if (local_elem_cnt <= kSingleBlockProcessNumThreshold) { - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel - <<<1, kBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( - input_blob->dptr(), target_blob->dptr(), out_blob->mut_dptr(), - local_elem_cnt, reduce_elem_cnt); - } else { - auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T); - const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize; - int launch_block = block_num; - OF_CUDA_CHECK(GetNumBlocks( - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, - kBlockSize, 0, block_num, 32, &launch_block)); - launch_block = std::min(tmp_buffer_elem_cnt, launch_block); - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel - <<stream()->As()->cuda_stream()>>>( - input_blob->dptr(), target_blob->dptr(), tmp_buffer->mut_dptr(), - local_elem_cnt, reduce_elem_cnt); - ReduceLocalSumKernel - <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( - tmp_buffer->mut_dptr(), out_blob->mut_dptr(), block_num); - } - } -}; - -template -class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; - ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateBCEWithLogitsReduceMeanKernelCache(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - - int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); - int64_t reduce_elem_cnt = local_elem_cnt; - if (cache != nullptr) { - // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
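      // Sketch (not part of the original source): for the mean-reduced loss, the backward pass below
      // computes dx = (sigmoid(x) - t) * dy / reduce_elem_cnt per element, where sigmoid is evaluated
      // as 0.5 * tanh(0.5 * x) + 0.5, algebraically identical to 1 / (1 + exp(-x)).
      // A single-element host-side reference, assuming <cmath> is available:
      //   auto bce_mean_grad = [](double x, double t, double dy, double reduce_elem_cnt) {
      //     const double sig = 0.5 * std::tanh(0.5 * x) + 0.5;
      //     return (sig - t) * dy / reduce_elem_cnt;
      //   };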
- const auto* bce_cache = dynamic_cast(cache); - CHECK_NOTNULL(bce_cache); - reduce_elem_cnt = bce_cache->reduce_elem_cnt(); - } - - const T* dy = dy_blob->dptr(); - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - using ComputeType = typename DefaultComputeType::type; - - OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory( - BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(reduce_elem_cnt, dy), - local_elem_cnt, dx, input, target, ctx->stream()->As()->cuda_stream()))); - } -}; - -} // namespace - -#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt(); \ - const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize; \ - int launch_block = block_num; \ - using ComputeType = typename DefaultComputeType::type; \ - OF_CUDA_CHECK(GetNumBlocks( \ - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, \ - kBlockSize, 0, block_num, 32, &launch_block)); \ - const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double) - -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double) - -} // namespace user_op +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include +#include "oneflow/core/kernel/cuda_graph_support.h" + +namespace oneflow { + +namespace user_op { + +namespace { + +constexpr int32_t kBlockSize = 1024; +constexpr int32_t kReduceLocalSumBlockSize = 1024; +constexpr int32_t kSingleBlockProcessNumThreshold = 1024; + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +template +inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, + int64_t max_blocks, int64_t waves, int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int max_active_blocks; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, + block_size, dynamic_smem_size); + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); + return hipSuccess; +} + +template +__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target, + Out* out, + const int32_t local_elem_cnt, + const int32_t reduce_elem_cnt) { + ComputeType zero = static_cast(0.0); + ComputeType one = static_cast(1.0); + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + ComputeType reduce_sum = 0.0; + CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) { + const ComputeType input_val = static_cast(input[i]); + const ComputeType target_val = static_cast(target[i]); + const ComputeType max_val = -input_val < zero ? 
zero : -input_val; + const ComputeType result = + (one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val))); + reduce_sum += result; + } + + const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); + if (threadIdx.x == 0) { out[blockIdx.x] = static_cast(block_reduce_sum / reduce_elem_cnt); } +} + +template +__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) { + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + ComputeType reduce_sum = 0.0; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; } + const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); + if (threadIdx.x == 0) { out[0] = static_cast(block_reduce_sum); } +} + +template +__device__ __forceinline__ T Sigmoid(const T x) { + const T half_of_one = static_cast(0.5); + return half_of_one * tanh(half_of_one * x) + half_of_one; +} + +template<> +__device__ __forceinline__ half Sigmoid(const half x) { + return __float2half(Sigmoid(__half2float(x))); +} + +template +struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor { + OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor( + const T elem_cnt_reciprocal, const T dy) + : elem_cnt_reciprocal(elem_cnt_reciprocal), dy(dy) {} + __device__ T operator()(const T input_val, const T target_val) const { + return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal; + } + const T dy; + const T elem_cnt_reciprocal; +}; + +template +struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor { + OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor( + const int32_t elem_cnt, const T* dy_ptr) + : elem_cnt_reciprocal(1.0f / elem_cnt), dy_ptr(dy_ptr) {} + __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor operator()() const { + return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(elem_cnt_reciprocal, + *dy_ptr); + } + const T* dy_ptr; + const T elem_cnt_reciprocal; +}; + +template +class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel, + public CudaGraphSupport { + public: + BinaryCrossEntropyWithLogitsMeanKernel() = default; + ~BinaryCrossEntropyWithLogitsMeanKernel() override = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateBCEWithLogitsReduceMeanKernelCache(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + + int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); + int64_t reduce_elem_cnt = local_elem_cnt; + + if (cache != nullptr) { + // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
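      // Sketch (not part of the original source): the launch below takes one of two paths. When
      // local_elem_cnt fits in a single block (<= kSingleBlockProcessNumThreshold), one block
      // computes the mean directly into out. Otherwise, stage one writes one partial sum per block
      // into tmp_buffer (each already divided by reduce_elem_cnt), with the grid size capped both
      // by GetNumBlocks (occupancy * SM count * waves) and by the tmp_buffer capacity reserved in
      // the registration's InferTmpSizeFn; stage two (ReduceLocalSumKernel) then adds the partial
      // sums on a single block to produce the final scalar.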
+ const auto* bce_cache = dynamic_cast(cache); + CHECK_NOTNULL(bce_cache); + reduce_elem_cnt = bce_cache->reduce_elem_cnt(); + } + + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* out = out_blob->mut_dptr(); + using ComputeType = typename DefaultComputeType::type; + + if (local_elem_cnt <= kSingleBlockProcessNumThreshold) { + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel + <<<1, kBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( + input_blob->dptr(), target_blob->dptr(), out_blob->mut_dptr(), + local_elem_cnt, reduce_elem_cnt); + } else { + auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T); + const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize; + int launch_block = block_num; + OF_CUDA_CHECK(GetNumBlocks( + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, + kBlockSize, 0, block_num, 32, &launch_block)); + launch_block = std::min(tmp_buffer_elem_cnt, launch_block); + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel + <<stream()->As()->cuda_stream()>>>( + input_blob->dptr(), target_blob->dptr(), tmp_buffer->mut_dptr(), + local_elem_cnt, reduce_elem_cnt); + ReduceLocalSumKernel + <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( + tmp_buffer->mut_dptr(), out_blob->mut_dptr(), block_num); + } + } +}; + +template +class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; + ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateBCEWithLogitsReduceMeanKernelCache(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + + int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); + int64_t reduce_elem_cnt = local_elem_cnt; + if (cache != nullptr) { + // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
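      // Sketch (not part of the original source; numbers are illustrative only): when the input is
      // split over several ranks, local_elem_cnt is the per-rank element count while reduce_elem_cnt
      // taken from the cache is the global count, so every rank scales its gradient by the global
      // mean factor. For example, with 4 ranks each holding 1024 elements, local_elem_cnt == 1024
      // and reduce_elem_cnt == 4096, and each rank multiplies (sigmoid(x) - t) * dy by 1 / 4096,
      // which matches the gradient of a globally reduced mean.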
+ const auto* bce_cache = dynamic_cast(cache); + CHECK_NOTNULL(bce_cache); + reduce_elem_cnt = bce_cache->reduce_elem_cnt(); + } + + const T* dy = dy_blob->dptr(); + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* dx = dx_blob->mut_dptr(); + using ComputeType = typename DefaultComputeType::type; + + OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory( + BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(reduce_elem_cnt, dy), + local_elem_cnt, dx, input, target, ctx->stream()->As()->cuda_stream()))); + } +}; + +} // namespace + +#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt(); \ + const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize; \ + int launch_block = block_num; \ + using ComputeType = typename DefaultComputeType::type; \ + OF_CUDA_CHECK(GetNumBlocks( \ + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, \ + kBlockSize, 0, block_num, 32, &launch_block)); \ + const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double) + +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp b/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp index 562f69c..2cf6e90 100644 --- a/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp +++ b/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp @@ -1,88 +1,88 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { -template -__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) { - CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); } -} -template<> -__global__ void ComputeLogGpu(const int64_t len, float16* out, const float16* in) { - const half* _in = reinterpret_cast(in); - half* _out = reinterpret_cast(out); - CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); } -} - -template -class BroadcastPowYGradKernel final : public user_op::OpKernel { - public: - BroadcastPowYGradKernel() = default; - ~BroadcastPowYGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0); - const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - - const int64_t num_axes = dz_tensor->shape_view().NumAxes(); - const int64_t elem_cnt = z_tensor->shape_view().elem_cnt(); - Memset(ctx->stream(), tmp_buffer->mut_dptr(), 0, - GetCudaAlignedSize(elem_cnt * sizeof(T))); - XpuVarNdarray z(z_tensor->shape_view(), z_tensor->dptr(), num_axes); - XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); - XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); - XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); - XpuVarNdarray x(x_tensor->shape_view(), x_tensor->dptr(), num_axes); - XpuVarNdarray dy(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes); - NdarrayUtil::BroadcastAdd(ctx->stream(), tmp, x, const_tmp); - ComputeLogGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, tmp_buffer->mut_dptr(), tmp_buffer->dptr()); - NdarrayUtil::BroadcastMul(ctx->stream(), tmp, dz, const_tmp); - NdarrayUtil::BroadcastMul(ctx->stream(), tmp, z, const_tmp); - NdarrayUtil::ReduceSum(ctx->stream(), dy, const_tmp, tmp); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace -#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair) \ - REGISTER_USER_KERNEL("broadcast_pow_y_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ - .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ - const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0); \ - const DataType& data_type = z.data_type(); \ - const int64_t elem_cnt = z.shape().elem_cnt(); \ - return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type)); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { +template +__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) { + CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); } +} +template<> +__global__ void ComputeLogGpu(const int64_t len, float16* out, const float16* in) { + const half* _in = reinterpret_cast(in); + half* _out = reinterpret_cast(out); + CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); } +} + +template +class BroadcastPowYGradKernel final : public user_op::OpKernel { + public: + BroadcastPowYGradKernel() = default; + ~BroadcastPowYGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0); + const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + + const int64_t num_axes = dz_tensor->shape_view().NumAxes(); + const int64_t elem_cnt = z_tensor->shape_view().elem_cnt(); + Memset(ctx->stream(), tmp_buffer->mut_dptr(), 0, + GetCudaAlignedSize(elem_cnt * sizeof(T))); + XpuVarNdarray z(z_tensor->shape_view(), z_tensor->dptr(), num_axes); + XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); + XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); + XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); + XpuVarNdarray x(x_tensor->shape_view(), x_tensor->dptr(), num_axes); + XpuVarNdarray dy(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes); + NdarrayUtil::BroadcastAdd(ctx->stream(), tmp, x, const_tmp); + ComputeLogGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, tmp_buffer->mut_dptr(), tmp_buffer->dptr()); + NdarrayUtil::BroadcastMul(ctx->stream(), tmp, dz, const_tmp); + NdarrayUtil::BroadcastMul(ctx->stream(), tmp, z, const_tmp); + NdarrayUtil::ReduceSum(ctx->stream(), dy, const_tmp, tmp); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace +#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair) \ + REGISTER_USER_KERNEL("broadcast_pow_y_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ + .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ + const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0); \ + const DataType& data_type = z.data_type(); \ + const int64_t elem_cnt = z.shape().elem_cnt(); \ + return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type)); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, 
(DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp b/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp index 927b0c8..3dde691 100644 --- a/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp +++ b/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp @@ -1,125 +1,125 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef NDEBUG -#undef NDEBUG -#endif -#include -#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -using CuInt64T = unsigned long long int; - -__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) { - return atomicCAS(address, compare, val); -} - -__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) { - static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); - return static_cast(atomicCAS(reinterpret_cast(address), - static_cast(compare), - static_cast(val))); -} - -__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) { - return atomicAdd(address, val); -} - -__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { - static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); - return static_cast( - atomicAdd(reinterpret_cast(address), static_cast(val))); -} - -template -__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) { - K old_key = AtomicCAS(key, static_cast(0), hash); - if (old_key == 0) { - V v = AtomicAdd(size, 1) + 1; - *value = v; - *out = v; - return true; - } else if (old_key == hash) { - while (true) { - V v = *value; - if (v != 0) { - *out = v; - break; - } - } - return true; - } else { - return false; - } -} - -template -__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) { - if (hash == 0) { - *out = 0; - return true; - } - const size_t start_idx = static_cast(hash) % capacity; - // fast path - { - T* key = table + start_idx * 2; - T* value = key + 1; - if (*key == hash && *value != 0) { - *out = *value; - return true; - } - } - for (size_t count = 0; count < capacity; ++count) { - const size_t idx = (start_idx + count) % capacity; - T* key = table + idx * 2; - T* value = key + 1; - if (TryGetOrInsert(key, value, size, hash, out)) { return true; } - } - return false; -} - -template -__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash, - T* out) { - CUDA_1D_KERNEL_LOOP(i, n) { - bool success = GetOrInsertOne(capacity, table, size, hash[i], out + i); - assert(success); - } -} - -} // namespace - -template -struct CategoricalOrdinalEncodeKernelUtil { - static void Encode(ep::Stream* stream, 
int64_t capacity, T* table, T* size, int64_t n, - const T* hash, T* out) { - EncodeGpu - <<As()->cuda_stream()>>>(capacity, table, size, n, hash, out); - } -}; - -#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct CategoricalOrdinalEncodeKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA - +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef NDEBUG +#undef NDEBUG +#endif +#include +#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +using CuInt64T = unsigned long long int; + +__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) { + return atomicCAS(address, compare, val); +} + +__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) { + static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); + return static_cast(atomicCAS(reinterpret_cast(address), + static_cast(compare), + static_cast(val))); +} + +__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) { + return atomicAdd(address, val); +} + +__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { + static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); + return static_cast( + atomicAdd(reinterpret_cast(address), static_cast(val))); +} + +template +__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) { + K old_key = AtomicCAS(key, static_cast(0), hash); + if (old_key == 0) { + V v = AtomicAdd(size, 1) + 1; + *value = v; + *out = v; + return true; + } else if (old_key == hash) { + while (true) { + V v = *value; + if (v != 0) { + *out = v; + break; + } + } + return true; + } else { + return false; + } +} + +template +__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) { + if (hash == 0) { + *out = 0; + return true; + } + const size_t start_idx = static_cast(hash) % capacity; + // fast path + { + T* key = table + start_idx * 2; + T* value = key + 1; + if (*key == hash && *value != 0) { + *out = *value; + return true; + } + } + for (size_t count = 0; count < capacity; ++count) { + const size_t idx = (start_idx + count) % capacity; + T* key = table + idx * 2; + T* value = key + 1; + if (TryGetOrInsert(key, value, size, hash, out)) { return true; } + } + return false; +} + +template +__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash, + T* out) { + CUDA_1D_KERNEL_LOOP(i, n) { + bool success = GetOrInsertOne(capacity, table, size, hash[i], out + i); + assert(success); + } +} + +} // namespace + +template +struct CategoricalOrdinalEncodeKernelUtil { + static void Encode(ep::Stream* stream, 
int64_t capacity, T* table, T* size, int64_t n, + const T* hash, T* out) { + EncodeGpu + <<As()->cuda_stream()>>>(capacity, table, size, n, hash, out); + } +}; + +#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct CategoricalOrdinalEncodeKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/clip_by_value_kernel.hip.cpp b/oneflow/user/kernels/clip_by_value_kernel.hip.cpp index eb2e550..7039aae 100644 --- a/oneflow/user/kernels/clip_by_value_kernel.hip.cpp +++ b/oneflow/user/kernels/clip_by_value_kernel.hip.cpp @@ -1,72 +1,72 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/clip_by_value_kernel.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void CudaClipForward(F clip_func, int64_t n, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = clip_func(x[i]); } -} - -template -__global__ void CudaClipBackward(F clip_func, int64_t n, const T* x, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP(i, n) { dx[i] = clip_func(x[i], dy[i]); } -} - -} // namespace - -template -struct ClipKernelUtil { - template - static void Forward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, T* y) { - if (n == 0) { return; } - RUN_CUDA_KERNEL((CudaClipForward), stream, n, clip_func, n, x, y); - } - - template - static void Backward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, const T* dy, - T* dx) { - if (n == 0) { return; } - RUN_CUDA_KERNEL((CudaClipBackward), stream, n, clip_func, n, x, dy, dx); - } -}; - -#define INITIATE_CLIP_KERNEL_UTIL_CUDA(dtype, dtype_v) \ - template struct ClipKernelUtil; \ - template void ClipKernelUtil::Forward( \ - ep::Stream*, ClipByMinFunctor, const int64_t n, const dtype*, dtype*); \ - template void ClipKernelUtil::Forward( \ - ep::Stream*, ClipByMaxFunctor, const int64_t n, const dtype*, dtype*); \ - template void ClipKernelUtil::Forward( \ - ep::Stream*, ClipByMinMaxFunctor, const int64_t n, const dtype*, dtype*); \ - template void ClipKernelUtil::Backward( \ - ep::Stream*, ClipByMinGradFunctor, const int64_t n, const dtype*, const dtype*, \ - dtype*); \ - template void ClipKernelUtil::Backward( \ - ep::Stream*, ClipByMaxGradFunctor, const int64_t n, const dtype*, const dtype*, \ - dtype*); \ - template void ClipKernelUtil::Backward( \ - ep::Stream*, ClipByMinMaxGradFunctor, const int64_t n, const dtype*, const dtype*, \ - dtype*); - -OF_PP_FOR_EACH_TUPLE(INITIATE_CLIP_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "hip/hip_runtime.h"
+#include "oneflow/user/kernels/clip_by_value_kernel.h"
+#include "oneflow/core/device/cuda_util.h"
+
+namespace oneflow {
+
+namespace {
+
+template<typename T, typename F>
+__global__ void CudaClipForward(F clip_func, int64_t n, const T* x, T* y) {
+  CUDA_1D_KERNEL_LOOP(i, n) { y[i] = clip_func(x[i]); }
+}
+
+template<typename T, typename F>
+__global__ void CudaClipBackward(F clip_func, int64_t n, const T* x, const T* dy, T* dx) {
+  CUDA_1D_KERNEL_LOOP(i, n) { dx[i] = clip_func(x[i], dy[i]); }
+}
+
+}  // namespace
+
+template<typename T>
+struct ClipKernelUtil<DeviceType::kCUDA, T> {
+  template<typename F>
+  static void Forward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, T* y) {
+    if (n == 0) { return; }
+    RUN_CUDA_KERNEL((CudaClipForward<T, F>), stream, n, clip_func, n, x, y);
+  }
+
+  template<typename F>
+  static void Backward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, const T* dy,
+                       T* dx) {
+    if (n == 0) { return; }
+    RUN_CUDA_KERNEL((CudaClipBackward<T, F>), stream, n, clip_func, n, x, dy, dx);
+  }
+};
+
+#define INITIATE_CLIP_KERNEL_UTIL_CUDA(dtype, dtype_v)                                          \
+  template struct ClipKernelUtil<DeviceType::kCUDA, dtype>;                                     \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Forward(                              \
+      ep::Stream*, ClipByMinFunctor<dtype>, const int64_t n, const dtype*, dtype*);             \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Forward(                              \
+      ep::Stream*, ClipByMaxFunctor<dtype>, const int64_t n, const dtype*, dtype*);             \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Forward(                              \
+      ep::Stream*, ClipByMinMaxFunctor<dtype>, const int64_t n, const dtype*, dtype*);          \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Backward(                             \
+      ep::Stream*, ClipByMinGradFunctor<dtype>, const int64_t n, const dtype*, const dtype*,    \
+      dtype*);                                                                                  \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Backward(                             \
+      ep::Stream*, ClipByMaxGradFunctor<dtype>, const int64_t n, const dtype*, const dtype*,    \
+      dtype*);                                                                                  \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Backward(                             \
+      ep::Stream*, ClipByMinMaxGradFunctor<dtype>, const int64_t n, const dtype*, const dtype*, \
+      dtype*);
+
+OF_PP_FOR_EACH_TUPLE(INITIATE_CLIP_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ)
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp b/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp
index 871ae34..446fd45 100644
--- a/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp
+++ b/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp
@@ -1,225 +1,225 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/balanced_splitter.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/user/kernels/math_unary_elementwise_func.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void GpuForward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, - const T m1, const T m2, const T m3, const T* in, const K* labels, T* out, - T* theta) { - CUDA_1D_KERNEL_LOOP(i, n) { - const int32_t row_id = i / num_classes; - const int32_t col_id = i - row_id * num_classes; - const T in_data = in[i]; - T out_data = in_data; - K label = labels[row_id] - lower_bound; - if (is_cosine_loss) { - if (label == col_id) { out_data = in_data - m3; } - } else { - if (label == col_id) { - const T theta_data = AcosFunctor::Forward(in_data); - out_data = CosFunctor::Forward(theta_data * m1 + m2) - m3; - theta[row_id] = theta_data; - } else if ((label < 0 || label >= num_classes) && col_id == 0) { - theta[row_id] = 0; - } - } - out[i] = out_data; - } -} - -template -__global__ void GpuBackward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, - const T m1, const T m2, const T m3, const T* dy, const K* labels, - const T* theta, T* dx) { - CUDA_1D_KERNEL_LOOP(i, n) { - const int32_t row_id = i / num_classes; - const int32_t col_id = i - row_id * num_classes; - K label = labels[row_id] - lower_bound; - const T dy_data = dy[i]; - const T theta_data = theta[row_id]; - T dx_data = dy_data; - if (label == col_id && !is_cosine_loss) { - dx_data = dy_data * SinFunctor::Forward(theta_data * m1 + m2) * m1 - / SinFunctor::Forward(theta_data); - } - dx[i] = dx_data; - } -} - -class CombinedMarginLossOpKernelCache final : public user_op::OpKernelCache { - public: - CombinedMarginLossOpKernelCache(int64_t lower, int64_t upper) : lower_(lower), upper_(upper) {} - ~CombinedMarginLossOpKernelCache() override = default; - - int64_t lower() const { return lower_; } - int64_t upper() const { return upper_; } - - private: - const int64_t lower_; - const int64_t upper_; -}; - -std::shared_ptr CreateCombinedMarginLossOpKernelCache( - user_op::KernelCacheContext* ctx, const std::string& in_arg_name) { - if (ctx->parallel_ctx().parallel_num() == 1) { return nullptr; } - - const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex(in_arg_name, 0); - if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 1 - && ctx->parallel_ctx().parallel_num() > 1) { - CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); - const user_op::TensorDesc* in_logical_desc = - ctx->LogicalTensorDesc4ArgNameAndIndex(in_arg_name, 0); - const auto depth = ctx->Attr("depth"); - CHECK_EQ(depth, in_logical_desc->shape().At(1)); - BalancedSplitter bs(depth, ctx->parallel_ctx().parallel_num()); - return std::make_shared( - bs.At(ctx->parallel_ctx().parallel_id()).begin(), - bs.At(ctx->parallel_ctx().parallel_id()).end()); - } else { - return nullptr; - } -} - -} // namespace - -template -class CombinedMarginLossGpuKernel final : public user_op::OpKernel { - public: - CombinedMarginLossGpuKernel() = default; - ~CombinedMarginLossGpuKernel() override = default; - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateCombinedMarginLossOpKernelCache(ctx, "x"); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, 
- const user_op::OpKernelCache* cache) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); - const float m1 = ctx->Attr("m1"); - const float m2 = ctx->Attr("m2"); - const float m3 = ctx->Attr("m3"); - int64_t lower_bound = 0; - if (cache != nullptr) { - auto* kernel_cache = dynamic_cast(cache); - CHECK_NOTNULL(kernel_cache); - CHECK_EQ(x->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); - lower_bound = kernel_cache->lower(); - } - if (m1 == 1.0 && m2 == 0.0) { - GpuForward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), - static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), - y->mut_dptr(), theta->mut_dptr()); - } else { - GpuForward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), - static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), - y->mut_dptr(), theta->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL(in_type, indices_type) \ - REGISTER_USER_KERNEL("combined_margin_loss") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL, FLOATING_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - -template -class CombinedMarginLossGradGpuKernel final : public user_op::OpKernel { - public: - CombinedMarginLossGradGpuKernel() = default; - ~CombinedMarginLossGradGpuKernel() override = default; - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateCombinedMarginLossOpKernelCache(ctx, "dy"); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - const user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float m1 = ctx->Attr("m1"); - const float m2 = ctx->Attr("m2"); - const float m3 = ctx->Attr("m3"); - int64_t lower_bound = 0; - if (cache != nullptr) { - auto* kernel_cache = dynamic_cast(cache); - CHECK_NOTNULL(kernel_cache); - CHECK_EQ(dy->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); - lower_bound = kernel_cache->lower(); - } - if (m1 == 1.0 && m2 == 0.0) { - GpuBackward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, - static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), - label->dptr(), theta->dptr(), dx->mut_dptr()); - } else { - GpuBackward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - 
dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, - static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), - label->dptr(), theta->dptr(), dx->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL(dy_type, indices_type) \ - REGISTER_USER_KERNEL("combined_margin_loss_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == OF_PP_PAIR_SECOND(dy_type)) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/balanced_splitter.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/user/kernels/math_unary_elementwise_func.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void GpuForward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, + const T m1, const T m2, const T m3, const T* in, const K* labels, T* out, + T* theta) { + CUDA_1D_KERNEL_LOOP(i, n) { + const int32_t row_id = i / num_classes; + const int32_t col_id = i - row_id * num_classes; + const T in_data = in[i]; + T out_data = in_data; + K label = labels[row_id] - lower_bound; + if (is_cosine_loss) { + if (label == col_id) { out_data = in_data - m3; } + } else { + if (label == col_id) { + const T theta_data = AcosFunctor::Forward(in_data); + out_data = CosFunctor::Forward(theta_data * m1 + m2) - m3; + theta[row_id] = theta_data; + } else if ((label < 0 || label >= num_classes) && col_id == 0) { + theta[row_id] = 0; + } + } + out[i] = out_data; + } +} + +template +__global__ void GpuBackward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, + const T m1, const T m2, const T m3, const T* dy, const K* labels, + const T* theta, T* dx) { + CUDA_1D_KERNEL_LOOP(i, n) { + const int32_t row_id = i / num_classes; + const int32_t col_id = i - row_id * num_classes; + K label = labels[row_id] - lower_bound; + const T dy_data = dy[i]; + const T theta_data = theta[row_id]; + T dx_data = dy_data; + if (label == col_id && !is_cosine_loss) { + dx_data = dy_data * SinFunctor::Forward(theta_data * m1 + m2) * m1 + / SinFunctor::Forward(theta_data); + } + dx[i] = dx_data; + } +} + +class CombinedMarginLossOpKernelCache final : public user_op::OpKernelCache { + public: + CombinedMarginLossOpKernelCache(int64_t lower, int64_t upper) : lower_(lower), upper_(upper) {} + ~CombinedMarginLossOpKernelCache() override = default; + + int64_t lower() const { return lower_; } + int64_t upper() const { return upper_; } + + private: + const int64_t lower_; + const int64_t upper_; +}; + +std::shared_ptr 
CreateCombinedMarginLossOpKernelCache( + user_op::KernelCacheContext* ctx, const std::string& in_arg_name) { + if (ctx->parallel_ctx().parallel_num() == 1) { return nullptr; } + + const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex(in_arg_name, 0); + if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 1 + && ctx->parallel_ctx().parallel_num() > 1) { + CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); + const user_op::TensorDesc* in_logical_desc = + ctx->LogicalTensorDesc4ArgNameAndIndex(in_arg_name, 0); + const auto depth = ctx->Attr("depth"); + CHECK_EQ(depth, in_logical_desc->shape().At(1)); + BalancedSplitter bs(depth, ctx->parallel_ctx().parallel_num()); + return std::make_shared( + bs.At(ctx->parallel_ctx().parallel_id()).begin(), + bs.At(ctx->parallel_ctx().parallel_id()).end()); + } else { + return nullptr; + } +} + +} // namespace + +template +class CombinedMarginLossGpuKernel final : public user_op::OpKernel { + public: + CombinedMarginLossGpuKernel() = default; + ~CombinedMarginLossGpuKernel() override = default; + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateCombinedMarginLossOpKernelCache(ctx, "x"); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); + const float m1 = ctx->Attr("m1"); + const float m2 = ctx->Attr("m2"); + const float m3 = ctx->Attr("m3"); + int64_t lower_bound = 0; + if (cache != nullptr) { + auto* kernel_cache = dynamic_cast(cache); + CHECK_NOTNULL(kernel_cache); + CHECK_EQ(x->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); + lower_bound = kernel_cache->lower(); + } + if (m1 == 1.0 && m2 == 0.0) { + GpuForward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), + static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), + y->mut_dptr(), theta->mut_dptr()); + } else { + GpuForward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), + static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), + y->mut_dptr(), theta->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL(in_type, indices_type) \ + REGISTER_USER_KERNEL("combined_margin_loss") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(in_type)) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL, FLOATING_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + +template +class CombinedMarginLossGradGpuKernel final : public user_op::OpKernel { + public: + CombinedMarginLossGradGpuKernel() = default; + ~CombinedMarginLossGradGpuKernel() override = default; + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const 
override { + return CreateCombinedMarginLossOpKernelCache(ctx, "dy"); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + const user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float m1 = ctx->Attr("m1"); + const float m2 = ctx->Attr("m2"); + const float m3 = ctx->Attr("m3"); + int64_t lower_bound = 0; + if (cache != nullptr) { + auto* kernel_cache = dynamic_cast(cache); + CHECK_NOTNULL(kernel_cache); + CHECK_EQ(dy->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); + lower_bound = kernel_cache->lower(); + } + if (m1 == 1.0 && m2 == 0.0) { + GpuBackward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, + static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), + label->dptr(), theta->dptr(), dx->mut_dptr()); + } else { + GpuBackward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, + static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), + label->dptr(), theta->dptr(), dx->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL(dy_type, indices_type) \ + REGISTER_USER_KERNEL("combined_margin_loss_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == OF_PP_PAIR_SECOND(dy_type)) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/count_not_finite_kernel.hip.cpp b/oneflow/user/kernels/count_not_finite_kernel.hip.cpp index 7275283..98a3487 100644 --- a/oneflow/user/kernels/count_not_finite_kernel.hip.cpp +++ b/oneflow/user/kernels/count_not_finite_kernel.hip.cpp @@ -1,173 +1,173 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct Param { - const T* x[N]; - int64_t x_elem_cnt[N]; - int64_t* y; - int64_t num_x; -}; - -using CuInt64T = unsigned long long int; - -__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { - static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); - return static_cast( - atomicAdd(reinterpret_cast(address), static_cast(val))); -} - -template -__inline__ __device__ bool IsFinite(T x) { - return isfinite(x); -} - -template<> -__inline__ __device__ bool IsFinite(half x) { - return IsFinite(static_cast(x)); -} - -template -__global__ void CountNotFiniteGpu(const int64_t n, const T* x, int64_t* y) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; - int64_t thread_count = 0; - CUDA_1D_KERNEL_LOOP(i, n) { - if (!IsFinite(x[i])) { thread_count += 1; } - } - __syncthreads(); - int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); - if (threadIdx.x == 0) { AtomicAdd(y, block_count_sum); } -} - -template -__global__ void MultiCountNotFiniteGpu(Param param) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; - int64_t thread_count = 0; - for (int32_t k = 0; k < param.num_x; ++k) { - CUDA_1D_KERNEL_LOOP(i, param.x_elem_cnt[k]) { - if (!IsFinite(param.x[k][i])) { thread_count += 1; } - } - } - __syncthreads(); - int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); - if (threadIdx.x == 0) { AtomicAdd(param.y, block_count_sum); } -} - -constexpr int64_t kCountNotFiniteNumBlocks = 512; - -int GetCountNotFiniteNumBlocks(const int64_t elem_cnt) { - return std::min((elem_cnt + kCudaThreadsNumPerBlock - 1) / kCudaThreadsNumPerBlock, - kCountNotFiniteNumBlocks); -} - -} // namespace - -template -class CountNotFiniteGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - CountNotFiniteGpuKernel() = default; - ~CountNotFiniteGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t elem_cnt = x->shape_view().elem_cnt(); - Memset(ctx->stream(), y->mut_dptr(), 0, - y->shape_view().elem_cnt() * sizeof(int64_t)); - CountNotFiniteGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, x->dptr(), y->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("count_not_finite") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(half) -REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(float) -REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(double) - -template -class MultiCountNotFiniteGpuKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MultiCountNotFiniteGpuKernel() = default; - ~MultiCountNotFiniteGpuKernel() override = default; - - private: - using 
user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - Param para; - Memset(ctx->stream(), y->mut_dptr(), 0, - y->shape_view().elem_cnt() * sizeof(int64_t)); - para.y = y->mut_dptr(); - - int64_t remain_size = ctx->inputs().size(); - int64_t input_id = 0; - while (remain_size > 0) { - if (remain_size > 128) { - remain_size -= 128; - para.num_x = 128; - } else { - para.num_x = remain_size; - remain_size = 0; - } - int64_t max_elem_cnt = 0; - for (int32_t i = 0; i < para.num_x; ++i) { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", input_id); - input_id++; - para.x[i] = x->dptr(); - para.x_elem_cnt[i] = x->shape_view().elem_cnt(); - max_elem_cnt = std::max(max_elem_cnt, x->shape_view().elem_cnt()); - } - MultiCountNotFiniteGpu - <<stream()->As()->cuda_stream()>>>(para); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("multi_count_not_finite") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(half) -REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(float) -REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct Param { + const T* x[N]; + int64_t x_elem_cnt[N]; + int64_t* y; + int64_t num_x; +}; + +using CuInt64T = unsigned long long int; + +__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { + static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); + return static_cast( + atomicAdd(reinterpret_cast(address), static_cast(val))); +} + +template +__inline__ __device__ bool IsFinite(T x) { + return isfinite(x); +} + +template<> +__inline__ __device__ bool IsFinite(half x) { + return IsFinite(static_cast(x)); +} + +template +__global__ void CountNotFiniteGpu(const int64_t n, const T* x, int64_t* y) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; + int64_t thread_count = 0; + CUDA_1D_KERNEL_LOOP(i, n) { + if (!IsFinite(x[i])) { thread_count += 1; } + } + __syncthreads(); + int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); + if (threadIdx.x == 0) { AtomicAdd(y, block_count_sum); } +} + +template +__global__ void MultiCountNotFiniteGpu(Param param) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; + int64_t thread_count = 0; + for (int32_t k = 0; k < param.num_x; ++k) { + CUDA_1D_KERNEL_LOOP(i, param.x_elem_cnt[k]) { + if (!IsFinite(param.x[k][i])) { thread_count += 1; } + } + } + __syncthreads(); + int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); + if (threadIdx.x == 0) { AtomicAdd(param.y, block_count_sum); } +} + +constexpr int64_t kCountNotFiniteNumBlocks = 512; + +int GetCountNotFiniteNumBlocks(const int64_t elem_cnt) { + return std::min((elem_cnt + kCudaThreadsNumPerBlock - 1) / kCudaThreadsNumPerBlock, + kCountNotFiniteNumBlocks); +} + +} // namespace + +template +class CountNotFiniteGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + CountNotFiniteGpuKernel() = default; + ~CountNotFiniteGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int64_t elem_cnt = x->shape_view().elem_cnt(); + Memset(ctx->stream(), y->mut_dptr(), 0, + y->shape_view().elem_cnt() * sizeof(int64_t)); + CountNotFiniteGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, x->dptr(), y->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("count_not_finite") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)); + +REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(half) +REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(float) +REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(double) + +template +class MultiCountNotFiniteGpuKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MultiCountNotFiniteGpuKernel() = default; + ~MultiCountNotFiniteGpuKernel() override = default; + + private: + using 
user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + Param para; + Memset(ctx->stream(), y->mut_dptr(), 0, + y->shape_view().elem_cnt() * sizeof(int64_t)); + para.y = y->mut_dptr(); + + int64_t remain_size = ctx->inputs().size(); + int64_t input_id = 0; + while (remain_size > 0) { + if (remain_size > 128) { + remain_size -= 128; + para.num_x = 128; + } else { + para.num_x = remain_size; + remain_size = 0; + } + int64_t max_elem_cnt = 0; + for (int32_t i = 0; i < para.num_x; ++i) { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", input_id); + input_id++; + para.x[i] = x->dptr(); + para.x_elem_cnt[i] = x->shape_view().elem_cnt(); + max_elem_cnt = std::max(max_elem_cnt, x->shape_view().elem_cnt()); + } + MultiCountNotFiniteGpu + <<stream()->As()->cuda_stream()>>>(para); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("multi_count_not_finite") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)); + +REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(half) +REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(float) +REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp b/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp index e754485..67568dd 100644 --- a/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp +++ b/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp @@ -1,146 +1,146 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/user/kernels/ctc_greedy_decoder.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { - -template -__global__ void CtcGreedyDecodeGpuMultiThread(int64_t* decoded_ptr, T* neg_sum_logits_ptr, - const T* log_probs_ptr, - const int64_t* input_lengths_ptr, - const bool merge_repeated, - const int64_t max_input_length, - const int64_t batch_size, const int64_t num_labels) { - const int64_t bid = blockIdx.x; - const int64_t tid = threadIdx.x; - - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - if (tid == 0) { - if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} - } - } - - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - extern __shared__ int64_t shared_max_indices_memory[]; - int64_t* shared_max_indices = (int64_t*)shared_max_indices_memory; - NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); - for (int64_t t = tid; t < max_input_length; t += blockDim.x) { - const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; - int64_t max_indice = 0; - T max_value = -FLT_MAX; - FOR_RANGE(int64_t, c, 0, num_labels) { - const T prob = prob_data_t[c]; - if (prob > max_value) { - max_indice = c; - max_value = prob; - } - } - shared_max_indices[t] = max_indice; - } - - __syncthreads(); - - if (tid == 0) { - int64_t prev_indices = -1, t_dec = 0; - FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { - const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; - const int64_t indice_t = shared_max_indices[t]; - neg_sum_logits_ptr[b] -= prob_data_t[indice_t]; - if (indice_t != num_labels - 1 && !(merge_repeated && (prev_indices == indice_t))) { - decoded_ptr[b * max_input_length + t_dec] = indice_t; - t_dec++; - } - prev_indices = indice_t; - } - FOR_RANGE(int64_t, t, t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } - } - } -} - -template -__global__ void CtcGreedyDecodeGpu(int64_t* decoded_ptr, T* neg_sum_logits_ptr, - const T* log_probs_ptr, const int64_t* input_lengths_ptr, - const bool merge_repeated, const int64_t max_input_length, - const int64_t batch_size, const int64_t num_labels) { - for (int64_t b = 0; b < batch_size; b++) { - if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} - } - NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); - - CUDA_1D_KERNEL_LOOP(b, batch_size) { - int prev_indices = -1, t_dec = 0; - neg_sum_logits_ptr[b] = 0; - FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { - const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; - int64_t max_indice = -1; - T max_value = -FLT_MAX; - FOR_RANGE(int64_t, c, 0, num_labels) { - if (prob_data_t[c] > max_value) { - max_indice = c; - max_value = prob_data_t[c]; - } - } - neg_sum_logits_ptr[b] -= max_value; - if (max_indice != num_labels - 1 && !(merge_repeated && (prev_indices == max_indice))) { - decoded_ptr[b * max_input_length + t_dec] = max_indice; - t_dec++; - } - prev_indices = max_indice; - } - FOR_RANGE(int64_t, t, t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } - } -} - -template -struct CTCGreedyDecoderFunctor final { - void operator()(ep::Stream* stream, int64_t* decoded_ptr, T* neg_sum_logits_ptr, - const T* log_probs_ptr, const int64_t* input_lengths_ptr, - const bool merge_repeated, const 
int64_t max_input_length, - const int64_t batch_size, const int64_t num_labels) { - int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; - int64_t shared_mem_size = max_input_length * sizeof(int64_t); - - int max_active_blocks; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, CtcGreedyDecodeGpu, kCudaThreadsNumPerBlock, shared_mem_size)); - if (max_active_blocks > 0) { - CtcGreedyDecodeGpuMultiThread<<As()->cuda_stream()>>>( - decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, - max_input_length, batch_size, num_labels); - - } else { - CtcGreedyDecodeGpu<<As()->cuda_stream()>>>( - decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, - max_input_length, batch_size, num_labels); - } - } -}; - -} // namespace - -REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, float); -REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/user/kernels/ctc_greedy_decoder.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { + +template +__global__ void CtcGreedyDecodeGpuMultiThread(int64_t* decoded_ptr, T* neg_sum_logits_ptr, + const T* log_probs_ptr, + const int64_t* input_lengths_ptr, + const bool merge_repeated, + const int64_t max_input_length, + const int64_t batch_size, const int64_t num_labels) { + const int64_t bid = blockIdx.x; + const int64_t tid = threadIdx.x; + + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + if (tid == 0) { + if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} + } + } + + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + extern __shared__ int64_t shared_max_indices_memory[]; + int64_t* shared_max_indices = (int64_t*)shared_max_indices_memory; + NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); + for (int64_t t = tid; t < max_input_length; t += blockDim.x) { + const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; + int64_t max_indice = 0; + T max_value = -FLT_MAX; + FOR_RANGE(int64_t, c, 0, num_labels) { + const T prob = prob_data_t[c]; + if (prob > max_value) { + max_indice = c; + max_value = prob; + } + } + shared_max_indices[t] = max_indice; + } + + __syncthreads(); + + if (tid == 0) { + int64_t prev_indices = -1, t_dec = 0; + FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { + const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; + const int64_t indice_t = shared_max_indices[t]; + neg_sum_logits_ptr[b] -= prob_data_t[indice_t]; + if (indice_t != num_labels - 1 && !(merge_repeated && (prev_indices == indice_t))) { + decoded_ptr[b * max_input_length + t_dec] = indice_t; + t_dec++; + } + prev_indices = indice_t; + } + FOR_RANGE(int64_t, t, 
t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } + } + } +} + +template +__global__ void CtcGreedyDecodeGpu(int64_t* decoded_ptr, T* neg_sum_logits_ptr, + const T* log_probs_ptr, const int64_t* input_lengths_ptr, + const bool merge_repeated, const int64_t max_input_length, + const int64_t batch_size, const int64_t num_labels) { + for (int64_t b = 0; b < batch_size; b++) { + if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} + } + NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); + + CUDA_1D_KERNEL_LOOP(b, batch_size) { + int prev_indices = -1, t_dec = 0; + neg_sum_logits_ptr[b] = 0; + FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { + const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; + int64_t max_indice = -1; + T max_value = -FLT_MAX; + FOR_RANGE(int64_t, c, 0, num_labels) { + if (prob_data_t[c] > max_value) { + max_indice = c; + max_value = prob_data_t[c]; + } + } + neg_sum_logits_ptr[b] -= max_value; + if (max_indice != num_labels - 1 && !(merge_repeated && (prev_indices == max_indice))) { + decoded_ptr[b * max_input_length + t_dec] = max_indice; + t_dec++; + } + prev_indices = max_indice; + } + FOR_RANGE(int64_t, t, t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } + } +} + +template +struct CTCGreedyDecoderFunctor final { + void operator()(ep::Stream* stream, int64_t* decoded_ptr, T* neg_sum_logits_ptr, + const T* log_probs_ptr, const int64_t* input_lengths_ptr, + const bool merge_repeated, const int64_t max_input_length, + const int64_t batch_size, const int64_t num_labels) { + int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; + int64_t shared_mem_size = max_input_length * sizeof(int64_t); + + int max_active_blocks; + OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, CtcGreedyDecodeGpu, kCudaThreadsNumPerBlock, shared_mem_size)); + if (max_active_blocks > 0) { + CtcGreedyDecodeGpuMultiThread<<As()->cuda_stream()>>>( + decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, + max_input_length, batch_size, num_labels); + + } else { + CtcGreedyDecodeGpu<<As()->cuda_stream()>>>( + decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, + max_input_length, batch_size, num_labels); + } + } +}; + +} // namespace + +REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, float); +REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp b/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp index abbaccf..688df77 100644 --- a/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp +++ b/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp @@ -1,285 +1,285 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/ctc_loss_kernel_util.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__device__ __inline__ static int get_target_prime(const int* targets_ptr, - const IDX* target_lengths_ptr, - int64_t max_target_length, int64_t b, int64_t s, - int blank, const int32_t targets_ndim) { - if (s % 2 == 0) { - return blank; - } else { - int64_t idx = 0; - if (targets_ndim == 1) { - FOR_RANGE(int64_t, i, 0, b) { idx += target_lengths_ptr[i]; } - } else { // targets_ndim == 2 - idx = b * max_target_length; - } - idx += s / 2; - return targets_ptr[idx]; - } -} - -template -__global__ void CtcLossGpu(const T* log_probs_ptr, const int* targets_ptr, - const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* alpha_ptr, T* loss_ptr, NdIndexOffsetHelper input_helper, - NdIndexOffsetHelper alpha_helper, const int64_t batch_size, - const int64_t max_input_length, const int64_t max_target_length, - const int blank, const int32_t targets_ndim) { - constexpr T neginf = -INFINITY; - const int32_t bid = blockIdx.x; - const int32_t tid = threadIdx.x; - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - if (tid == 0) { - if (input_lengths_ptr[b] > max_input_length) {asm("s_trap 0;");} - if (target_lengths_ptr[b] > max_target_length) {asm("s_trap 0;");} - } - } - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - IDX input_length = input_lengths_ptr[b]; - IDX target_length = target_lengths_ptr[b]; - - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, s)] = neginf; - } - if (tid == 0) { - alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 0)] = - log_probs_ptr[input_helper.NdIndexToOffset(0, b, blank)]; - if (target_length > 0) { - int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, 1, - blank, targets_ndim); - alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 1)] = - log_probs_ptr[input_helper.NdIndexToOffset(0, b, target)]; - } - } - __syncthreads(); - for (IDX t = 1; t < input_length; t++) { - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, - max_target_length, b, s, blank, targets_ndim); - T la1 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s)]; - T la2, la3, lamax = la1; - if (s > 0) { - la2 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 1)]; - if (la2 > lamax) lamax = la2; - } else { - la2 = neginf; - } - if ((s > 1) - && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s - 2, - blank, targets_ndim) - != current_target_prime)) { - la3 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 2)]; - if (la3 > lamax) lamax = la3; - } else { - la3 = neginf; - } - if (lamax == neginf) lamax = 0; - - int64_t idx_t_s = alpha_helper.NdIndexToOffset(b, t, s); - alpha_ptr[idx_t_s] = - log(exp(la1 - lamax) + exp(la2 - lamax) + exp(la3 - lamax)) + lamax - + log_probs_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; - } - __syncthreads(); - } - if (tid == 0) { - if (target_length == 0) { - int64_t idx = alpha_helper.NdIndexToOffset(b, input_length - 1, 0); - loss_ptr[b] = -alpha_ptr[idx]; - } else { - int64_t idx1 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2); - int64_t idx2 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2 - 1); - T l1 = alpha_ptr[idx1]; - T l2 = alpha_ptr[idx2]; - T m = max(l1, l2); - m = ((m == neginf) ? 
0 : m); - T log_likelihood = log(exp(l1 - m) + exp(l2 - m)) + m; - loss_ptr[b] = -log_likelihood; - } - } - } -} - -template -__global__ void CtcLossGradGpu( - const T* grad_out_ptr, const T* loss_ptr, const T* alpha_ptr, const T* log_probs_ptr, - const int* targets_ptr, const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* beta_ptr, T* grad_ptr, NdIndexOffsetHelper input_helper, - NdIndexOffsetHelper beta_helper, const int64_t batch_size, - const int64_t max_input_length, const int64_t max_target_length, const int64_t num_labels, - const int blank, const bool zero_infinity, const int32_t targets_ndim) { - constexpr T neginf = -INFINITY; - const int32_t bid = blockIdx.x; - const int32_t tid = threadIdx.x; - - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - IDX input_length = input_lengths_ptr[b]; - IDX target_length = target_lengths_ptr[b]; - T nll = loss_ptr[b]; - if (zero_infinity && nll == INFINITY) { - for (IDX t = tid; t < max_input_length; t += blockDim.x) { - for (IDX c = 0; c < num_labels; c++) { - grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = 0; - } - } - __syncthreads(); - continue; - } - - if (input_length > 0) { - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, s)] = neginf; - } - if (tid == 0) { - beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] = - log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)]; - if (target_length > 0) { - int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, - 2 * target_length - 1, blank, targets_ndim); - beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] = - log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)]; - } - } - __syncthreads(); - } - for (IDX t = input_length - 2; t >= 0; t--) { - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, - max_target_length, b, s, blank, targets_ndim); - T lb1 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s)]; - T lb2, lb3, lbmax = lb1; - if (s < 2 * target_length) { - lb2 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 1)]; - if (lb2 > lbmax) lbmax = lb2; - } else { - lb2 = neginf; - } - if ((s < 2 * target_length - 1) - && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s + 2, - blank, targets_ndim) - != current_target_prime)) { - lb3 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 2)]; - if (lb3 > lbmax) lbmax = lb3; - } else { - lb3 = neginf; - } - if (lbmax == neginf) lbmax = 0; - - int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); - beta_ptr[idx_t_s] = - log(exp(lb1 - lbmax) + exp(lb2 - lbmax) + exp(lb3 - lbmax)) + lbmax - + log_probs_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; - } - __syncthreads(); - } - for (IDX t = tid; t < max_input_length; t += blockDim.x) { - for (IDX c = 0; c < num_labels; c++) { - grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = t < input_length ? 
neginf : 0; - } - } - __syncthreads(); - if (tid == 0) { - grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)] = - alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] - + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)]; - if (target_length > 0) { - int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, - 2 * target_length - 1, blank, targets_ndim); - grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)] = - alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] - + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)]; - } - } - __syncthreads(); - for (IDX t = tid; t < input_length; t += blockDim.x) { - for (IDX s = 0; (t < input_length - 1) && (s < 2 * target_length + 1); s += 1) { - int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, - max_target_length, b, s, blank, targets_ndim); - int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); - T log_alpha_beta = alpha_ptr[idx_t_s] + beta_ptr[idx_t_s]; - T& lcab = grad_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; - if (lcab == neginf) { - lcab = log_alpha_beta; - } else { - T m = max(lcab, log_alpha_beta); - lcab = log(exp(lcab - m) + exp(log_alpha_beta - m)) + m; - } - } - for (int32_t c = 0; c < num_labels; c++) { - T& res = grad_ptr[input_helper.NdIndexToOffset(t, b, c)]; - T lp = log_probs_ptr[input_helper.NdIndexToOffset(t, b, c)]; - res = (exp(lp) - exp(res + nll - lp)) * grad_out_ptr[b]; - } - } - } -} - -} // namespace - -template -struct CtcLossKernelUtil { - static void CtcLossForward(ep::Stream* stream, const T* log_probs_ptr, const int* targets_ptr, - const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* alpha_ptr, T* loss_ptr, - NdIndexOffsetHelper& input_helper, - NdIndexOffsetHelper& alpha_helper, - const int64_t batch_size, const int64_t max_input_length, - const int64_t max_target_length, const int blank, - const int32_t targets_ndim) { - int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; - RUN_CUDA_KERNEL((CtcLossGpu), stream, thread_num, log_probs_ptr, targets_ptr, - input_lengths_ptr, target_lengths_ptr, alpha_ptr, loss_ptr, input_helper, - alpha_helper, batch_size, max_input_length, max_target_length, blank, - targets_ndim); - } - - static void CtcLossBackward(ep::Stream* stream, const T* grad_out_ptr, const T* loss_ptr, - const T* alpha_ptr, const T* log_probs_ptr, const int* targets_ptr, - const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* beta_ptr, T* grad_ptr, - NdIndexOffsetHelper& input_helper, - NdIndexOffsetHelper& beta_helper, - const int64_t batch_size, const int64_t max_input_length, - const int64_t max_target_length, const int64_t num_labels, - const int blank, const bool zero_infinity, - const int32_t targets_ndim) { - int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; - RUN_CUDA_KERNEL((CtcLossGradGpu), stream, thread_num, grad_out_ptr, loss_ptr, alpha_ptr, - log_probs_ptr, targets_ptr, input_lengths_ptr, target_lengths_ptr, beta_ptr, - grad_ptr, input_helper, beta_helper, batch_size, max_input_length, - max_target_length, num_labels, blank, zero_infinity, targets_ndim); - } -}; - -#define INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA(device_type_v, log_probs_dtype_pair, \ - input_lengths_dtype_pair) \ - template struct CtcLossKernelUtil; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA, (DeviceType::kCUDA), - FLOATING_DATA_TYPE_SEQ, 
INDEX_DATA_TYPE_SEQ) -#undef INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/ctc_loss_kernel_util.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__device__ __inline__ static int get_target_prime(const int* targets_ptr, + const IDX* target_lengths_ptr, + int64_t max_target_length, int64_t b, int64_t s, + int blank, const int32_t targets_ndim) { + if (s % 2 == 0) { + return blank; + } else { + int64_t idx = 0; + if (targets_ndim == 1) { + FOR_RANGE(int64_t, i, 0, b) { idx += target_lengths_ptr[i]; } + } else { // targets_ndim == 2 + idx = b * max_target_length; + } + idx += s / 2; + return targets_ptr[idx]; + } +} + +template +__global__ void CtcLossGpu(const T* log_probs_ptr, const int* targets_ptr, + const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* alpha_ptr, T* loss_ptr, NdIndexOffsetHelper input_helper, + NdIndexOffsetHelper alpha_helper, const int64_t batch_size, + const int64_t max_input_length, const int64_t max_target_length, + const int blank, const int32_t targets_ndim) { + constexpr T neginf = -INFINITY; + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + if (tid == 0) { + if (input_lengths_ptr[b] > max_input_length) {asm("s_trap 0;");} + if (target_lengths_ptr[b] > max_target_length) {asm("s_trap 0;");} + } + } + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + IDX input_length = input_lengths_ptr[b]; + IDX target_length = target_lengths_ptr[b]; + + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, s)] = neginf; + } + if (tid == 0) { + alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 0)] = + log_probs_ptr[input_helper.NdIndexToOffset(0, b, blank)]; + if (target_length > 0) { + int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, 1, + blank, targets_ndim); + alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 1)] = + log_probs_ptr[input_helper.NdIndexToOffset(0, b, target)]; + } + } + __syncthreads(); + for (IDX t = 1; t < input_length; t++) { + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, + max_target_length, b, s, blank, targets_ndim); + T la1 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s)]; + T la2, la3, lamax = la1; + if (s > 0) { + la2 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 1)]; + if (la2 > lamax) lamax = la2; + } else { + la2 = neginf; + } + if ((s > 1) + && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s - 2, + blank, targets_ndim) + != current_target_prime)) { + la3 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 2)]; + if (la3 > lamax) lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) lamax = 0; + + int64_t idx_t_s = 
alpha_helper.NdIndexToOffset(b, t, s); + alpha_ptr[idx_t_s] = + log(exp(la1 - lamax) + exp(la2 - lamax) + exp(la3 - lamax)) + lamax + + log_probs_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; + } + __syncthreads(); + } + if (tid == 0) { + if (target_length == 0) { + int64_t idx = alpha_helper.NdIndexToOffset(b, input_length - 1, 0); + loss_ptr[b] = -alpha_ptr[idx]; + } else { + int64_t idx1 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2); + int64_t idx2 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2 - 1); + T l1 = alpha_ptr[idx1]; + T l2 = alpha_ptr[idx2]; + T m = max(l1, l2); + m = ((m == neginf) ? 0 : m); + T log_likelihood = log(exp(l1 - m) + exp(l2 - m)) + m; + loss_ptr[b] = -log_likelihood; + } + } + } +} + +template +__global__ void CtcLossGradGpu( + const T* grad_out_ptr, const T* loss_ptr, const T* alpha_ptr, const T* log_probs_ptr, + const int* targets_ptr, const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* beta_ptr, T* grad_ptr, NdIndexOffsetHelper input_helper, + NdIndexOffsetHelper beta_helper, const int64_t batch_size, + const int64_t max_input_length, const int64_t max_target_length, const int64_t num_labels, + const int blank, const bool zero_infinity, const int32_t targets_ndim) { + constexpr T neginf = -INFINITY; + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + IDX input_length = input_lengths_ptr[b]; + IDX target_length = target_lengths_ptr[b]; + T nll = loss_ptr[b]; + if (zero_infinity && nll == INFINITY) { + for (IDX t = tid; t < max_input_length; t += blockDim.x) { + for (IDX c = 0; c < num_labels; c++) { + grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = 0; + } + } + __syncthreads(); + continue; + } + + if (input_length > 0) { + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, s)] = neginf; + } + if (tid == 0) { + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] = + log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)]; + if (target_length > 0) { + int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, + 2 * target_length - 1, blank, targets_ndim); + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] = + log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)]; + } + } + __syncthreads(); + } + for (IDX t = input_length - 2; t >= 0; t--) { + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, + max_target_length, b, s, blank, targets_ndim); + T lb1 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s)]; + T lb2, lb3, lbmax = lb1; + if (s < 2 * target_length) { + lb2 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 1)]; + if (lb2 > lbmax) lbmax = lb2; + } else { + lb2 = neginf; + } + if ((s < 2 * target_length - 1) + && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s + 2, + blank, targets_ndim) + != current_target_prime)) { + lb3 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 2)]; + if (lb3 > lbmax) lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) lbmax = 0; + + int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); + beta_ptr[idx_t_s] = + log(exp(lb1 - lbmax) + exp(lb2 - lbmax) + exp(lb3 - lbmax)) + lbmax + + log_probs_ptr[input_helper.NdIndexToOffset(t, b, 
current_target_prime)]; + } + __syncthreads(); + } + for (IDX t = tid; t < max_input_length; t += blockDim.x) { + for (IDX c = 0; c < num_labels; c++) { + grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = t < input_length ? neginf : 0; + } + } + __syncthreads(); + if (tid == 0) { + grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)] = + alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] + + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)]; + if (target_length > 0) { + int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, + 2 * target_length - 1, blank, targets_ndim); + grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)] = + alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] + + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)]; + } + } + __syncthreads(); + for (IDX t = tid; t < input_length; t += blockDim.x) { + for (IDX s = 0; (t < input_length - 1) && (s < 2 * target_length + 1); s += 1) { + int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, + max_target_length, b, s, blank, targets_ndim); + int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); + T log_alpha_beta = alpha_ptr[idx_t_s] + beta_ptr[idx_t_s]; + T& lcab = grad_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + T m = max(lcab, log_alpha_beta); + lcab = log(exp(lcab - m) + exp(log_alpha_beta - m)) + m; + } + } + for (int32_t c = 0; c < num_labels; c++) { + T& res = grad_ptr[input_helper.NdIndexToOffset(t, b, c)]; + T lp = log_probs_ptr[input_helper.NdIndexToOffset(t, b, c)]; + res = (exp(lp) - exp(res + nll - lp)) * grad_out_ptr[b]; + } + } + } +} + +} // namespace + +template +struct CtcLossKernelUtil { + static void CtcLossForward(ep::Stream* stream, const T* log_probs_ptr, const int* targets_ptr, + const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* alpha_ptr, T* loss_ptr, + NdIndexOffsetHelper& input_helper, + NdIndexOffsetHelper& alpha_helper, + const int64_t batch_size, const int64_t max_input_length, + const int64_t max_target_length, const int blank, + const int32_t targets_ndim) { + int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; + RUN_CUDA_KERNEL((CtcLossGpu), stream, thread_num, log_probs_ptr, targets_ptr, + input_lengths_ptr, target_lengths_ptr, alpha_ptr, loss_ptr, input_helper, + alpha_helper, batch_size, max_input_length, max_target_length, blank, + targets_ndim); + } + + static void CtcLossBackward(ep::Stream* stream, const T* grad_out_ptr, const T* loss_ptr, + const T* alpha_ptr, const T* log_probs_ptr, const int* targets_ptr, + const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* beta_ptr, T* grad_ptr, + NdIndexOffsetHelper& input_helper, + NdIndexOffsetHelper& beta_helper, + const int64_t batch_size, const int64_t max_input_length, + const int64_t max_target_length, const int64_t num_labels, + const int blank, const bool zero_infinity, + const int32_t targets_ndim) { + int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; + RUN_CUDA_KERNEL((CtcLossGradGpu), stream, thread_num, grad_out_ptr, loss_ptr, alpha_ptr, + log_probs_ptr, targets_ptr, input_lengths_ptr, target_lengths_ptr, beta_ptr, + grad_ptr, input_helper, beta_helper, batch_size, max_input_length, + max_target_length, num_labels, blank, zero_infinity, targets_ndim); + } +}; + +#define 
INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA(device_type_v, log_probs_dtype_pair, \ + input_lengths_dtype_pair) \ + template struct CtcLossKernelUtil; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA, (DeviceType::kCUDA), + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#undef INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/cum_backward_kernel.hip.cpp b/oneflow/user/kernels/cum_backward_kernel.hip.cpp index b609c6e..98f581d 100644 --- a/oneflow/user/kernels/cum_backward_kernel.hip.cpp +++ b/oneflow/user/kernels/cum_backward_kernel.hip.cpp @@ -1,139 +1,139 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/kernel/new_kernel_util.h" - -namespace oneflow { -#ifdef WITH_ROCM -namespace { -template -__global__ void CumProdBackward(const T* dy_ptr, T* dx_ptr, const T* output_ptr, const T* input_ptr, - const int64_t up_space, const int64_t space, - const int64_t down_space, const int64_t thread_num) { - // A thread is responsible for a row along specific dimension. - const size_t up_space_step = space * down_space; - CUDA_1D_KERNEL_LOOP_T(size_t, i, thread_num) { - const size_t up_space_id = i / down_space; - const size_t down_space_id = i % down_space; - const size_t ptr_offset = up_space_id * up_space_step + down_space_id; - auto* dy_ptr_base = dy_ptr + ptr_offset; - auto* dx_ptr_base = dx_ptr + ptr_offset; - auto* input_ptr_base = input_ptr + ptr_offset; - auto* output_ptr_base = output_ptr + ptr_offset; - - // Buffer storing number of zero element along specific dimension. - // Use dx as tmp buffer. - for (size_t j = 0; j < space; j++) { - const size_t data_offset = j * down_space; - int is_zero = input_ptr_base[data_offset] == 0 ? 1 : 0; - dx_ptr_base[data_offset] = is_zero + (j == 0 ? 0 : dx_ptr_base[data_offset - down_space]); - } - - // Find index of first zero in input. - size_t first_zero_index = space; - for (size_t j = 0; j < space; j++) { - const size_t data_offset = j * down_space; - if (dx_ptr_base[data_offset] == 1) { - first_zero_index = j; - break; - } - } - - // Suppose z is index of first zero element in input, - // for element which index is less than z grad is computed as below: - T reverse_cumsum = 0; - for (size_t j = 0; j < first_zero_index; j++) { - const size_t cur_index = first_zero_index - j - 1; - const size_t data_offset = cur_index * down_space; - reverse_cumsum += output_ptr_base[data_offset] * dy_ptr_base[data_offset]; - dx_ptr_base[data_offset] = reverse_cumsum / input_ptr_base[data_offset]; - } - - // Where index is z, its grad is computed as below: - if (first_zero_index == space) { return; } - T cumprod = 1; - T cumsum = 0; - T cumprod_before_first_zero = - first_zero_index == 0 ? 
1 : output_ptr_base[(first_zero_index - 1) * down_space]; - for (size_t j = first_zero_index; j < space; j++) { - const size_t down_space_offset = j * down_space; - // Recover dx_ptr default value - if (dx_ptr_base[down_space_offset] >= 1) { dx_ptr_base[down_space_offset] = 0; } - if (j != first_zero_index) { cumprod *= input_ptr_base[down_space_offset]; } - cumsum += cumprod_before_first_zero * dy_ptr_base[down_space_offset] * cumprod; - } - dx_ptr_base[first_zero_index * down_space] = cumsum; - } -} -} // namespace - -template -class GpuCumProdGradKernel final : public user_op::OpKernel { - public: - GpuCumProdGradKernel() = default; - ~GpuCumProdGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* output = ctx->Tensor4ArgNameAndIndex("output", 0); - const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const auto elem_cnt = dy->shape_view().elem_cnt(); - if (!elem_cnt) { return; } - - const auto* output_ptr = output->dptr(); - const auto* input_ptr = input->dptr(); - const auto* dy_ptr = dy->dptr(); - auto* dx_ptr = dx->mut_dptr(); - - // Data partition: up_space|space|down_space - auto dim = ctx->Attr("dim"); - const auto up_space = elem_cnt / dx->shape_view().Count(dim); - const auto space = dx->shape_view().At(dim); - const auto down_space = dx->shape_view().Count(dim + 1); - const size_t thread_num = up_space * down_space; - - if (space == 1) { - Memcpy(ctx->stream(), dx_ptr, dy_ptr, elem_cnt * sizeof(T)); - return; - } - ep::CudaLaunchConfig config{}; - ctx->stream()->As()->InitLaunchConfigWithWaves( - &config, thread_num, /*DefaultBlockSize*/ 256, /*max_wave*/ 1); - CumProdBackward<<stream()->As()->cuda_stream()>>>( - dy_ptr, dx_ptr, output_ptr, input_ptr, up_space, space, down_space, thread_num); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_CUMPROD_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("cumprod_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_CUMPROD_GRAD_KERNEL(float) -REGISTER_CUDA_CUMPROD_GRAD_KERNEL(double) -#undef REGISTER_CUDA_CUMPROD_GRAD_KERNEL -#endif +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/kernel/new_kernel_util.h" + +namespace oneflow { +#ifdef WITH_ROCM +namespace { +template +__global__ void CumProdBackward(const T* dy_ptr, T* dx_ptr, const T* output_ptr, const T* input_ptr, + const int64_t up_space, const int64_t space, + const int64_t down_space, const int64_t thread_num) { + // A thread is responsible for a row along specific dimension. + const size_t up_space_step = space * down_space; + CUDA_1D_KERNEL_LOOP_T(size_t, i, thread_num) { + const size_t up_space_id = i / down_space; + const size_t down_space_id = i % down_space; + const size_t ptr_offset = up_space_id * up_space_step + down_space_id; + auto* dy_ptr_base = dy_ptr + ptr_offset; + auto* dx_ptr_base = dx_ptr + ptr_offset; + auto* input_ptr_base = input_ptr + ptr_offset; + auto* output_ptr_base = output_ptr + ptr_offset; + + // Buffer storing number of zero element along specific dimension. + // Use dx as tmp buffer. + for (size_t j = 0; j < space; j++) { + const size_t data_offset = j * down_space; + int is_zero = input_ptr_base[data_offset] == 0 ? 1 : 0; + dx_ptr_base[data_offset] = is_zero + (j == 0 ? 0 : dx_ptr_base[data_offset - down_space]); + } + + // Find index of first zero in input. + size_t first_zero_index = space; + for (size_t j = 0; j < space; j++) { + const size_t data_offset = j * down_space; + if (dx_ptr_base[data_offset] == 1) { + first_zero_index = j; + break; + } + } + + // Suppose z is index of first zero element in input, + // for element which index is less than z grad is computed as below: + T reverse_cumsum = 0; + for (size_t j = 0; j < first_zero_index; j++) { + const size_t cur_index = first_zero_index - j - 1; + const size_t data_offset = cur_index * down_space; + reverse_cumsum += output_ptr_base[data_offset] * dy_ptr_base[data_offset]; + dx_ptr_base[data_offset] = reverse_cumsum / input_ptr_base[data_offset]; + } + + // Where index is z, its grad is computed as below: + if (first_zero_index == space) { return; } + T cumprod = 1; + T cumsum = 0; + T cumprod_before_first_zero = + first_zero_index == 0 ? 
1 : output_ptr_base[(first_zero_index - 1) * down_space]; + for (size_t j = first_zero_index; j < space; j++) { + const size_t down_space_offset = j * down_space; + // Recover dx_ptr default value + if (dx_ptr_base[down_space_offset] >= 1) { dx_ptr_base[down_space_offset] = 0; } + if (j != first_zero_index) { cumprod *= input_ptr_base[down_space_offset]; } + cumsum += cumprod_before_first_zero * dy_ptr_base[down_space_offset] * cumprod; + } + dx_ptr_base[first_zero_index * down_space] = cumsum; + } +} +} // namespace + +template +class GpuCumProdGradKernel final : public user_op::OpKernel { + public: + GpuCumProdGradKernel() = default; + ~GpuCumProdGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* output = ctx->Tensor4ArgNameAndIndex("output", 0); + const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const auto elem_cnt = dy->shape_view().elem_cnt(); + if (!elem_cnt) { return; } + + const auto* output_ptr = output->dptr(); + const auto* input_ptr = input->dptr(); + const auto* dy_ptr = dy->dptr(); + auto* dx_ptr = dx->mut_dptr(); + + // Data partition: up_space|space|down_space + auto dim = ctx->Attr("dim"); + const auto up_space = elem_cnt / dx->shape_view().Count(dim); + const auto space = dx->shape_view().At(dim); + const auto down_space = dx->shape_view().Count(dim + 1); + const size_t thread_num = up_space * down_space; + + if (space == 1) { + Memcpy(ctx->stream(), dx_ptr, dy_ptr, elem_cnt * sizeof(T)); + return; + } + ep::CudaLaunchConfig config{}; + ctx->stream()->As()->InitLaunchConfigWithWaves( + &config, thread_num, /*DefaultBlockSize*/ 256, /*max_wave*/ 1); + CumProdBackward<<stream()->As()->cuda_stream()>>>( + dy_ptr, dx_ptr, output_ptr, input_ptr, up_space, space, down_space, thread_num); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_CUMPROD_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("cumprod_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_CUMPROD_GRAD_KERNEL(float) +REGISTER_CUDA_CUMPROD_GRAD_KERNEL(double) +#undef REGISTER_CUDA_CUMPROD_GRAD_KERNEL +#endif } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/cum_forward_kernel.hip.cpp b/oneflow/user/kernels/cum_forward_kernel.hip.cpp index 1a96aff..1c3671c 100644 --- a/oneflow/user/kernels/cum_forward_kernel.hip.cpp +++ b/oneflow/user/kernels/cum_forward_kernel.hip.cpp @@ -1,169 +1,169 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ndarray/binary_func.h" - -namespace oneflow { -#ifdef WITH_ROCM -namespace { - -// total thread number: cs_up_space * cs_down_space -// in cs_down_space part, use cs_down_space threads -// to calculate as follows(m=cs_down_space-1, n=cs_space-1, '|' stands for dependency): -// dm0, ..., d10, d00 -// | | | -// dm1, ..., d11, d01 -// | | | -// dm2, ..., d12, d02 -// | | | -// ... ... ... -// | | | -// dmn, ..., d1n, d0n -template class BinaryFunc> -__global__ void CumsumForwardGpu(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space, - int64_t cs_down_space) { - CUDA_1D_KERNEL_LOOP(i, cs_up_space * cs_down_space) { - auto cs_up_space_id = i / cs_down_space; - auto cs_down_space_id = i - (i / cs_down_space) * cs_down_space; - - auto* in_ptr_base = in_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; - auto* out_ptr_base = out_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; - - // calculate cs_space data in one thread - for (auto j = 0; j < cs_space; j++) { - auto idx = j * cs_down_space; - out_ptr_base[idx] = in_ptr_base[idx]; - if (j != 0) { - out_ptr_base[idx] = - BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); - } - } - } -} -template class BinaryFunc> -__global__ void CumsumForwardGpuUpSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_space, - int64_t cs_down_space) { - CUDA_1D_KERNEL_LOOP(i, cs_down_space) { - auto* in_ptr_base = in_ptr + i; - auto* out_ptr_base = out_ptr + i; - - // calculate cs_space data in one thread - for (auto j = 0; j < cs_space; j++) { - auto idx = j * cs_down_space; - out_ptr_base[idx] = in_ptr_base[idx]; - if (j != 0) { - out_ptr_base[idx] = - BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); - } - } - } -} -template class BinaryFunc> -__global__ void CumsumForwardGpuDownSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_up_space, - int64_t cs_space) { - CUDA_1D_KERNEL_LOOP(i, cs_up_space) { - auto* in_ptr_base = in_ptr + i * cs_space; - auto* out_ptr_base = out_ptr + i * cs_space; - - // calculate cs_space data in one thread - for (auto j = 0; j < cs_space; j++) { - out_ptr_base[j] = in_ptr_base[j]; - if (j != 0) { out_ptr_base[j] = BinaryFunc::Invoke(out_ptr_base[j], out_ptr_base[j - 1]); } - } - } -} -} // namespace - -template class BinaryFunc> -class GpuCumKernel : public user_op::OpKernel { - public: - GpuCumKernel() = default; - ~GpuCumKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - // judge whether tensor has 0 size dimension first - const auto* in = ctx->Tensor4ArgNameAndIndex("x", 0); - auto elem_cnt = in->shape_view().elem_cnt(); - if (!elem_cnt) { return; } - - auto* out = ctx->Tensor4ArgNameAndIndex("y", 0); - auto dim = ctx->Attr("dim"); - const auto* in_ptr = in->dptr(); - auto* out_ptr = out->mut_dptr(); - - // data partition: up_space|space|down_space - auto up_space = elem_cnt / in->shape_view().Count(dim); - auto space = in->shape_view().At(dim); - auto down_space = in->shape_view().Count(dim + 1); - auto thread_num = up_space * down_space; - - if (up_space == 1) { - RUN_CUDA_KERNEL((CumsumForwardGpuUpSpaceIs1), ctx->stream(), thread_num, - in_ptr, out_ptr, space, down_space); - } else if (down_space == 1) { - 
RUN_CUDA_KERNEL((CumsumForwardGpuDownSpaceIs1), ctx->stream(), thread_num, - in_ptr, out_ptr, up_space, space); - } else { - RUN_CUDA_KERNEL((CumsumForwardGpu), ctx->stream(), thread_num, in_ptr, out_ptr, - up_space, space, down_space); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuCumSumKernel final : public GpuCumKernel { - public: - GpuCumSumKernel() = default; - ~GpuCumSumKernel() = default; -}; - -#define REGISTER_CUDA_CUMSUM_KERNEL(dtype) \ - REGISTER_USER_KERNEL("cumsum").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_CUMSUM_KERNEL(int32_t) -REGISTER_CUDA_CUMSUM_KERNEL(int64_t) -REGISTER_CUDA_CUMSUM_KERNEL(float) -REGISTER_CUDA_CUMSUM_KERNEL(double) -#undef REGISTER_CUDA_CUMSUM_KERNEL - -template -class GpuCumProdKernel final : public GpuCumKernel { - public: - GpuCumProdKernel() = default; - ~GpuCumProdKernel() = default; -}; - -#define REGISTER_CUDA_CUMPROD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("cumprod").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_CUMPROD_KERNEL(int32_t) -REGISTER_CUDA_CUMPROD_KERNEL(int64_t) -REGISTER_CUDA_CUMPROD_KERNEL(float) -REGISTER_CUDA_CUMPROD_KERNEL(double) -#undef REGISTER_CUDA_CUMPROD_KERNEL -#endif +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ndarray/binary_func.h" + +namespace oneflow { +#ifdef WITH_ROCM +namespace { + +// total thread number: cs_up_space * cs_down_space +// in cs_down_space part, use cs_down_space threads +// to calculate as follows(m=cs_down_space-1, n=cs_space-1, '|' stands for dependency): +// dm0, ..., d10, d00 +// | | | +// dm1, ..., d11, d01 +// | | | +// dm2, ..., d12, d02 +// | | | +// ... ... ... 
+// | | | +// dmn, ..., d1n, d0n +template class BinaryFunc> +__global__ void CumsumForwardGpu(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space, + int64_t cs_down_space) { + CUDA_1D_KERNEL_LOOP(i, cs_up_space * cs_down_space) { + auto cs_up_space_id = i / cs_down_space; + auto cs_down_space_id = i - (i / cs_down_space) * cs_down_space; + + auto* in_ptr_base = in_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; + auto* out_ptr_base = out_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; + + // calculate cs_space data in one thread + for (auto j = 0; j < cs_space; j++) { + auto idx = j * cs_down_space; + out_ptr_base[idx] = in_ptr_base[idx]; + if (j != 0) { + out_ptr_base[idx] = + BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); + } + } + } +} +template class BinaryFunc> +__global__ void CumsumForwardGpuUpSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_space, + int64_t cs_down_space) { + CUDA_1D_KERNEL_LOOP(i, cs_down_space) { + auto* in_ptr_base = in_ptr + i; + auto* out_ptr_base = out_ptr + i; + + // calculate cs_space data in one thread + for (auto j = 0; j < cs_space; j++) { + auto idx = j * cs_down_space; + out_ptr_base[idx] = in_ptr_base[idx]; + if (j != 0) { + out_ptr_base[idx] = + BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); + } + } + } +} +template class BinaryFunc> +__global__ void CumsumForwardGpuDownSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_up_space, + int64_t cs_space) { + CUDA_1D_KERNEL_LOOP(i, cs_up_space) { + auto* in_ptr_base = in_ptr + i * cs_space; + auto* out_ptr_base = out_ptr + i * cs_space; + + // calculate cs_space data in one thread + for (auto j = 0; j < cs_space; j++) { + out_ptr_base[j] = in_ptr_base[j]; + if (j != 0) { out_ptr_base[j] = BinaryFunc::Invoke(out_ptr_base[j], out_ptr_base[j - 1]); } + } + } +} +} // namespace + +template class BinaryFunc> +class GpuCumKernel : public user_op::OpKernel { + public: + GpuCumKernel() = default; + ~GpuCumKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + // judge whether tensor has 0 size dimension first + const auto* in = ctx->Tensor4ArgNameAndIndex("x", 0); + auto elem_cnt = in->shape_view().elem_cnt(); + if (!elem_cnt) { return; } + + auto* out = ctx->Tensor4ArgNameAndIndex("y", 0); + auto dim = ctx->Attr("dim"); + const auto* in_ptr = in->dptr(); + auto* out_ptr = out->mut_dptr(); + + // data partition: up_space|space|down_space + auto up_space = elem_cnt / in->shape_view().Count(dim); + auto space = in->shape_view().At(dim); + auto down_space = in->shape_view().Count(dim + 1); + auto thread_num = up_space * down_space; + + if (up_space == 1) { + RUN_CUDA_KERNEL((CumsumForwardGpuUpSpaceIs1), ctx->stream(), thread_num, + in_ptr, out_ptr, space, down_space); + } else if (down_space == 1) { + RUN_CUDA_KERNEL((CumsumForwardGpuDownSpaceIs1), ctx->stream(), thread_num, + in_ptr, out_ptr, up_space, space); + } else { + RUN_CUDA_KERNEL((CumsumForwardGpu), ctx->stream(), thread_num, in_ptr, out_ptr, + up_space, space, down_space); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuCumSumKernel final : public GpuCumKernel { + public: + GpuCumSumKernel() = default; + ~GpuCumSumKernel() = default; +}; + +#define REGISTER_CUDA_CUMSUM_KERNEL(dtype) \ + REGISTER_USER_KERNEL("cumsum").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && 
(user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_CUMSUM_KERNEL(int32_t) +REGISTER_CUDA_CUMSUM_KERNEL(int64_t) +REGISTER_CUDA_CUMSUM_KERNEL(float) +REGISTER_CUDA_CUMSUM_KERNEL(double) +#undef REGISTER_CUDA_CUMSUM_KERNEL + +template +class GpuCumProdKernel final : public GpuCumKernel { + public: + GpuCumProdKernel() = default; + ~GpuCumProdKernel() = default; +}; + +#define REGISTER_CUDA_CUMPROD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("cumprod").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_CUMPROD_KERNEL(int32_t) +REGISTER_CUDA_CUMPROD_KERNEL(int64_t) +REGISTER_CUDA_CUMPROD_KERNEL(float) +REGISTER_CUDA_CUMPROD_KERNEL(double) +#undef REGISTER_CUDA_CUMPROD_KERNEL +#endif } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/data_shuffle_kernel.hip.cpp b/oneflow/user/kernels/data_shuffle_kernel.hip.cpp index 703b044..ed4b9b3 100644 --- a/oneflow/user/kernels/data_shuffle_kernel.hip.cpp +++ b/oneflow/user/kernels/data_shuffle_kernel.hip.cpp @@ -1,1523 +1,1523 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/nccl_util.h" -#include "oneflow/core/job/eager_nccl_comm_manager.h" -#include "oneflow/core/job/parallel_desc.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/user/kernels/gather_kernel_util.h" -#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/embedding/hash_functions.hip.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/hip/atomic.hip.h" - -namespace oneflow { - -namespace { - -template -struct TableEntry { - K key; - uint32_t value; -}; - -template -__global__ void HashTableUniqueAndPartitionPairs(const uint32_t table_capacity, - const uint32_t num_keys, int32_t num_partition, - IDX* unique_counts, TableEntry* table, - const K* keys, const V* values, - K* partitioned_unique_keys, - V* partitioned_unique_values, IDX* reverse_index, - bool need_process_values) { - CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_keys) { - IDX r_index_plus_one = 0; - const K key = keys[i]; - size_t key_hash = HASH()(key); - uint32_t partition_id = key_hash % num_partition; - IDX* unique_count = unique_counts + partition_id; - K* unique_keys = partitioned_unique_keys + partition_id * num_keys; - uint32_t pos = key_hash % table_capacity; - const K key_hi = (key | 0x1); - const K key_lo = (key & 0x1); - uint32_t counter = 0; - while (r_index_plus_one == 0) { - bool prob_next = false; - K* key_ptr = &table[pos].key; - volatile uint32_t* table_value_ptr = &table[pos].value; - const K old_key = cuda::atomic::CAS(key_ptr, 0, key_hi); - if (old_key == 0) { - IDX unique_pos = cuda::atomic::Add(unique_count, 1); - r_index_plus_one = 
unique_pos + 1; - unique_keys[unique_pos] = key; - if (need_process_values) { - partitioned_unique_values[partition_id * num_keys + unique_pos] = values[i]; - } - *table_value_ptr = ((r_index_plus_one << 1U) | key_lo); - } else if (old_key == key_hi) { - const uint32_t value = *table_value_ptr; - if (value == 0) { - // do nothing - } else if ((value & 0x1) == key_lo) { - r_index_plus_one = (value >> 1U); - } else { - prob_next = true; - } - } else { - prob_next = true; - } - if (prob_next) { - pos += 1; - counter += 1; - if (pos >= table_capacity) { pos -= table_capacity; } - if (counter >= table_capacity) { asm volatile("s_trap 0;"); } - } - } - reverse_index[i] = partition_id * num_keys + r_index_plus_one - 1; - } -} - -template -__global__ void GenerateTableIds(int32_t elem_cnt, int32_t num_tables, U* table_ids) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { table_ids[i] = i % num_tables; } -} - -template -void UniqueAndPartition(hipStream_t cuda_stream, int64_t num_ids, size_t capacity, - int64_t num_partition, const K* ids, const V* table_ids, - IDX* num_partitioned_unique_ids_ptr, K* partitioned_unique_ids, - V* partitioned_unique_table_ids, IDX* inverse_unique_partition_indices, - void* workspace_ptr, size_t workspace_bytes, bool need_process_table_ids) { - size_t table_capacity_bytes = capacity * sizeof(TableEntry); - CHECK_GE(workspace_bytes, table_capacity_bytes); - OF_CUDA_CHECK(hipMemsetAsync(workspace_ptr, 0, table_capacity_bytes, cuda_stream)); - OF_CUDA_CHECK( - hipMemsetAsync(num_partitioned_unique_ids_ptr, 0, num_partition * sizeof(IDX), cuda_stream)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(HashTableUniqueAndPartitionPairs), BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, - capacity, num_ids, num_partition, num_partitioned_unique_ids_ptr, - reinterpret_cast*>(workspace_ptr), ids, table_ids, partitioned_unique_ids, - partitioned_unique_table_ids, inverse_unique_partition_indices, need_process_table_ids); -} - -template -void ShuffleData(hipStream_t cuda_stream, ncclComm_t comm, DataType data_type, - const std::vector& send_offsets, - const std::vector& send_elem_cnt, const T* send_data, - const std::vector& recv_offsets, - const std::vector& recv_elem_cnt, T* recv_data) { - ncclDataType_t nccl_data_type = GetNcclDataType(data_type); - const int64_t parallel_num = send_offsets.size(); - OF_NCCL_CHECK(ncclGroupStart()); - for (int64_t i = 0; i < parallel_num; ++i) { - OF_NCCL_CHECK(ncclSend(send_data + send_offsets.at(i), send_elem_cnt.at(i), nccl_data_type, i, - comm, cuda_stream)); - OF_NCCL_CHECK(ncclRecv(recv_data + recv_offsets.at(i), recv_elem_cnt.at(i), nccl_data_type, i, - comm, cuda_stream)); - } - OF_NCCL_CHECK(ncclGroupEnd()); -} - -template -void MakeShuffleParams(const IDX* host_num_unique_matrix, const int64_t num_ids, - const int64_t row_size, int64_t parallel_id, int64_t parallel_num, - std::vector* scatter_offset_vec, - std::vector* scatter_elem_cnt_vec, - std::vector* gather_offset_vec, - std::vector* gather_elem_cnt_vec) { - scatter_offset_vec->resize(parallel_num); - scatter_elem_cnt_vec->resize(parallel_num); - gather_offset_vec->resize(parallel_num); - gather_elem_cnt_vec->resize(parallel_num); - int64_t gather_offset = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t scatter_elem_cnt = - host_num_unique_matrix[parallel_id * parallel_num + i] * row_size; - const int64_t gather_elem_cnt = - host_num_unique_matrix[i * parallel_num + parallel_id] * row_size; - scatter_offset_vec->at(i) = i * num_ids * row_size; - 
scatter_elem_cnt_vec->at(i) = scatter_elem_cnt; - gather_offset_vec->at(i) = gather_offset; - gather_elem_cnt_vec->at(i) = gather_elem_cnt; - gather_offset += gather_elem_cnt; - } -} - -template -void ShuffleIdsAndTableIds(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, DataType ids_data_type, - DataType table_ids_data_type, IDX* host_num_unique_matrix, - K* partitioned_unique_ids, U* partitioned_unique_table_ids, - K* received_ids, U* received_table_ids, int64_t* received_elem_cnt, - bool need_process_table_ids) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - MakeShuffleParams(host_num_unique_matrix, num_ids, 1, parallel_id, parallel_num, &send_offsets, - &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, ids_data_type, send_offsets, send_elem_cnt, partitioned_unique_ids, - recv_offsets, recv_elem_cnt, received_ids); - *received_elem_cnt = recv_offsets.at(parallel_num - 1) + recv_elem_cnt.at(parallel_num - 1); - if (need_process_table_ids) { - ShuffleData(cuda_stream, comm, table_ids_data_type, send_offsets, send_elem_cnt, - partitioned_unique_table_ids, recv_offsets, recv_elem_cnt, received_table_ids); - } -} - -enum class IdShuffleBufferType { - kNumPartitionedUnique = 0, - kPartitionedUniqueIds, - kReceivedIds, - kTableIds, - kPartitionedUniqueTableIds, - kReceivedTableIds, - kWorkspace, - kMaxType -}; - -template -class IdShuffleTmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(IdShuffleTmpBufferManager); - IdShuffleTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t parallel_num, - bool need_table_ids, bool need_process_table_ids) - : offset_(0), - offsets_(static_cast(IdShuffleBufferType::kMaxType), -1), - sizes_(static_cast(IdShuffleBufferType::kMaxType)), - ptr_(ptr) { - const int64_t num_table_ids = need_process_table_ids ? num_ids : 0; - const size_t table_ids_bytes = need_table_ids ? 
num_ids * sizeof(U) : 0; - AllocBuffer(IdShuffleBufferType::kNumPartitionedUnique, parallel_num * sizeof(IDX)); - size_t partitioned_ids_bytes = parallel_num * num_ids * sizeof(K); - AllocBuffer(IdShuffleBufferType::kPartitionedUniqueIds, partitioned_ids_bytes); - AllocBuffer(IdShuffleBufferType::kReceivedIds, partitioned_ids_bytes); - AllocBuffer(IdShuffleBufferType::kTableIds, table_ids_bytes); - size_t partitioned_table_ids_bytes = parallel_num * num_table_ids * sizeof(U); - AllocBuffer(IdShuffleBufferType::kPartitionedUniqueTableIds, partitioned_table_ids_bytes); - AllocBuffer(IdShuffleBufferType::kReceivedTableIds, partitioned_table_ids_bytes); - const size_t hash_table_capacity = parallel_num * num_ids; - AllocBuffer(IdShuffleBufferType::kWorkspace, hash_table_capacity * sizeof(TableEntry)); - } - - template - T* Ptr(IdShuffleBufferType type) { - CHECK(ptr_ != nullptr); - int64_t offset = offsets_.at(static_cast(type)); - CHECK_NE(offset, -1); - return reinterpret_cast(reinterpret_cast(ptr_) + offset); - } - - int64_t Size(IdShuffleBufferType type) { return sizes_.at(static_cast(type)); } - - size_t TotalBufferSize() const { return offset_; } - - private: - void AllocBuffer(IdShuffleBufferType type, size_t size) { - const size_t type_id = static_cast(type); - CHECK_EQ(offsets_.at(type_id), -1); - offsets_.at(type_id) = offset_; - sizes_.at(type_id) = size; - offset_ += GetCudaAlignedSize(size); - } - size_t offset_; - std::vector offsets_; - std::vector sizes_; - void* ptr_; -}; - -template -class DataShuffleKernelState final : public user_op::OpKernelState { - public: - explicit DataShuffleKernelState(user_op::KernelInitContext* ctx) - : device_index_(-1), - stream_name_(EagerNcclCommMgr::kDefaultStreamName), - parallel_desc_(ctx->parallel_desc()) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } - OF_CUDA_CHECK(hipMallocHost( - reinterpret_cast(&host_num_unique_matrix_), - parallel_desc_.parallel_num() * parallel_desc_.parallel_num() * sizeof(IDX))); - } - ~DataShuffleKernelState() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(host_num_unique_matrix_)); - } - - ncclComm_t comm() { return GetOrCreate().comm; } - - IDX* HostNumUniqueMatrix() { return host_num_unique_matrix_; } - - private: - struct Comm { - Comm(ncclComm_t comm) : comm(comm) {} - ncclComm_t comm; - }; - - const Comm& GetOrCreate() { - if (!comm_) { Init(); } - return *comm_; - } - - void Init() { - std::set> device_set; - for (int64_t parallel_id = 0; parallel_id < parallel_desc_.parallel_num(); ++parallel_id) { - int64_t machine_id = CHECK_JUST(parallel_desc_.MachineId4ParallelId(parallel_id)); - int64_t device_id = CHECK_JUST(parallel_desc_.DeviceId4ParallelId(parallel_id)); - device_set.emplace(std::make_pair(machine_id, device_id)); - } - EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Singleton::Get()); - ncclComm_t comm; - comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - comm_.reset(new Comm(comm)); - } - - int device_index_; - bool has_independent_stream_; - std::string stream_name_; - ParallelDesc parallel_desc_; - std::unique_ptr comm_; - IDX* host_num_unique_matrix_; -}; - -} // namespace - -template -class IdShuffleKernel final : public user_op::OpKernel { - public: - IdShuffleKernel() = default; - ~IdShuffleKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return 
std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* kernel_state = dynamic_cast*>(state); - CHECK(kernel_state != nullptr); - const user_op::Tensor* ids = ctx->Tensor4ArgNameAndIndex("ids", 0); - user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); - user_op::Tensor* inverse_unique_partition_indices = - ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); - user_op::Tensor* cur_rank_num_unique = ctx->Tensor4ArgNameAndIndex("cur_rank_num_unique", 0); - user_op::Tensor* cur_rank_unique_ids = ctx->Tensor4ArgNameAndIndex("cur_rank_unique_ids", 0); - user_op::Tensor* cur_rank_unique_table_ids = - ctx->Tensor4ArgNameAndIndex("cur_rank_unique_table_ids", 0); - user_op::Tensor* cur_rank_inverse_indices = - ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t num_tables = ctx->Attr("num_tables"); - const bool has_table_ids = ctx->has_input("table_ids", 0); - const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); - const bool need_process_table_ids = (has_table_ids || num_tables > 1); - const int64_t num_ids = ids->shape_view().elem_cnt(); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - IdShuffleTmpBufferManager buffer_manager( - tmp_buffer->mut_dptr(), num_ids, parallel_num, need_gen_table_ids, need_process_table_ids); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.TotalBufferSize()); - - const U* table_ids_ptr; - if (has_table_ids) { - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); - table_ids_ptr = reinterpret_cast(table_ids->dptr()); - } else if (need_gen_table_ids) { - hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, - num_ids, num_tables, buffer_manager.template Ptr(IdShuffleBufferType::kTableIds)); - table_ids_ptr = buffer_manager.template Ptr(IdShuffleBufferType::kTableIds); - } else { - table_ids_ptr = nullptr; - } - IDX* num_partitioned_unique = - buffer_manager.template Ptr(IdShuffleBufferType::kNumPartitionedUnique); - K* partitioned_unique_ids = - buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueIds); - U* partitioned_unique_table_ids = - buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueTableIds); - IDX* num_unique_matrix_ptr = reinterpret_cast(num_unique_matrix->mut_dptr()); - size_t hash_table_capacity = parallel_num * num_ids; - void* workspace_ptr = buffer_manager.Ptr(IdShuffleBufferType::kWorkspace); - size_t workspace_size = buffer_manager.Size(IdShuffleBufferType::kWorkspace); - UniqueAndPartition( - cuda_stream, num_ids, hash_table_capacity, parallel_num, - reinterpret_cast(ids->dptr()), table_ids_ptr, num_partitioned_unique, - partitioned_unique_ids, partitioned_unique_table_ids, - reinterpret_cast(inverse_unique_partition_indices->mut_dptr()), workspace_ptr, - workspace_size, need_process_table_ids); - ncclComm_t comm = kernel_state->comm(); - OF_NCCL_CHECK(ncclAllGather(num_partitioned_unique, num_unique_matrix_ptr, parallel_num, - GetNcclDataType(num_unique_matrix->data_type()), comm, - cuda_stream)); - IDX* host_num_unique_matrix = 
kernel_state->HostNumUniqueMatrix(); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix_ptr, - parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, - cuda_stream)); - CHECK_JUST(ctx->stream()->Sync()); - - K* received_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedIds); - U* received_table_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedTableIds); - int64_t received_elem_cnt = 0; - ShuffleIdsAndTableIds(cuda_stream, comm, parallel_id, parallel_num, num_ids, ids->data_type(), - cur_rank_unique_table_ids->data_type(), host_num_unique_matrix, - partitioned_unique_ids, partitioned_unique_table_ids, received_ids, - received_table_ids, &received_elem_cnt, need_process_table_ids); - UniqueAndPartition( - cuda_stream, received_elem_cnt, hash_table_capacity, 1, received_ids, received_table_ids, - reinterpret_cast(cur_rank_num_unique->mut_dptr()), - reinterpret_cast(cur_rank_unique_ids->mut_dptr()), - reinterpret_cast(cur_rank_unique_table_ids->mut_dptr()), - reinterpret_cast(cur_rank_inverse_indices->mut_dptr()), workspace_ptr, workspace_size, - need_process_table_ids); - if (!need_process_table_ids) { - OF_CUDA_CHECK(hipMemsetAsync(cur_rank_unique_table_ids->mut_dptr(), 0, - received_elem_cnt * sizeof(U), cuda_stream)); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) - -#define TABLE_ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) - -#define IDX_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define REGISTER_CUDA_ID_SHUFFLE_KERNEL(k_dtype_pair, table_id_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("id_shuffle") \ - .SetCreateFn< \ - IdShuffleKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("ids", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ - && (user_op::HobDataType("cur_rank_unique_table_ids", 0) \ - == OF_PP_PAIR_SECOND(table_id_dtype_pair)) \ - && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& ids = ctx->InputTensorDesc("ids", 0); \ - const bool has_table_ids = ctx->has_input("table_ids", 0); \ - const int32_t num_tables = ctx->Attr("num_tables"); \ - const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); \ - const bool need_process_table_ids = (has_table_ids || num_tables > 1); \ - IdShuffleTmpBufferManager \ - buffer_manager(nullptr, ids.shape().elem_cnt(), ctx->parallel_desc().parallel_num(), \ - need_gen_table_ids, need_process_table_ids); \ - return buffer_manager.TotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ID_SHUFFLE_KERNEL, ID_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType 
data_type, IDX* host_num_unique_matrix, - T* reverse_unique_cur_rank_embeddings, T* received_embeddings) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, - reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); -} - -// Quantized Version. -template -void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - int8_t* reverse_unique_cur_rank_embeddings, int8_t* received_embeddings, - T* reverse_cur_rank_quantize_factor, T* recv_quantize_factor) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - // shuffle quantized_embedding - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, - reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); - // shuffle quantize_factor - MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, - parallel_num, &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, - reverse_cur_rank_quantize_factor, recv_offsets, recv_elem_cnt, recv_quantize_factor); -} - -__device__ float RoundHalfAwayFromZero(const float x) { - float abs_val = abs(x); - float floor_val = floor(abs_val + static_cast(0.5)); - return copysignf(floor_val, x); -} - -// warp reduce version. 
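// The "warp reduce version" kernels below implement per-row symmetric int8 quantization for
// the embedding shuffle: each row's absolute maximum is kept as its quantize factor, values
// are scaled by 127 / factor and rounded half away from zero when quantizing, and multiplied
// back by factor / 127 when dequantizing. A minimal host-side sketch of that round trip is
// given here for reference; QuantizeRowHost and DequantizeHost are illustrative names only
// and are not functions from this file.
//
//   #include <algorithm>
//   #include <cmath>
//   #include <cstdint>
//   #include <vector>
//
//   void QuantizeRowHost(const std::vector<float>& row, std::vector<int8_t>* q, float* factor) {
//     float max_abs = 0.0f;
//     for (float v : row) { max_abs = std::max(max_abs, std::fabs(v)); }
//     *factor = max_abs;  // per-row factor, mirrors quantize_factor[row] in the kernel
//     const float scale = max_abs > 0.0f ? 127.0f / max_abs : 0.0f;  // zero guard added in this sketch
//     q->resize(row.size());
//     for (size_t i = 0; i < row.size(); ++i) {
//       // round half away from zero, matching RoundHalfAwayFromZero above
//       const float r = std::copysign(std::floor(std::fabs(row[i] * scale) + 0.5f), row[i]);
//       (*q)[i] = static_cast<int8_t>(r);
//     }
//   }
//
//   float DequantizeHost(int8_t q, float factor) { return static_cast<float>(q) * factor / 127.0f; }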
-constexpr int32_t kWarpSize = 32; -constexpr int32_t kMaxColSize = 1024; - -template -__inline__ __device__ T WarpMaxAllReduce(T val) { - for (int32_t lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) { - // val = max(val, __shfl_xor_sync(0xffffffff, val, lane_mask, thread_group_width)); - val = max(val, __shfl_xor(val, lane_mask, thread_group_width)); - } - return val; -} - -inline hipError_t GetWarpImplNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, - int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int tpm; - { - hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); - if (err != hipSuccess) { return err; } - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); - return hipSuccess; -} - -template -__global__ void QuantizeWarpImplKernel(const T* src, int8_t* dst, T* quantize_factor, - const int64_t rows, const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - constexpr int num_packs = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - ComputeType buf[rows_per_access][cols_per_thread]; - const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StoreType = cuda::elementwise::PackType; - using StorePack = cuda::elementwise::Pack; - - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_abs_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; row_id++) { - ComputeType* row_buf = buf[row_id]; - thread_abs_max[row_id] = 0.0; -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; pack_id++) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - LoadPack load_pack; - if (!padding || col < cols) { - const int64_t load_offset = ((row + row_id) * cols + col) / pack_size; - load_pack.storage = *(reinterpret_cast(src) + load_offset); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - row_buf[pack_offset + i] = static_cast(load_pack.elem[i]); - thread_abs_max[row_id] = max(thread_abs_max[row_id], abs(row_buf[pack_offset + i])); - } - } else { -#pragma unroll - for (int i = 0; i < pack_size; i++) { row_buf[pack_offset + i] = 0.0; } - } - } - } - ComputeType warp_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; row_id++) { - warp_max[row_id] = WarpMaxAllReduce(thread_abs_max[row_id]); - if (threadIdx.x == 0) { quantize_factor[row + row_id] = static_cast(warp_max[row_id]); } - ComputeType* row_buf = buf[row_id]; - ComputeType quantize_factor_val = static_cast(127.0) / warp_max[row_id]; -#pragma unroll - for (int col = 0; col < cols_per_thread; col++) { - row_buf[col] = RoundHalfAwayFromZero(row_buf[col] * quantize_factor_val); - } -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; pack_id++) { - const 
int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - StorePack store_pack; - if (!padding || col < cols) { - const int64_t store_offset = ((row + row_id) * cols + col) / pack_size; - for (int i = 0; i < pack_size; i++) { - store_pack.elem[i] = static_cast(row_buf[pack_id * pack_size + i]); - } - *(reinterpret_cast(dst) + store_offset) = store_pack.storage; - } - } - } - } -} - -template -inline hipError_t LaunchQuantizeWarpImpl(hipStream_t stream, const T* src, int8_t* dst, - T* quantize_factor, const int64_t rows, - const int64_t cols) { - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x = 0; - - hipError_t err = GetWarpImplNumBlocks(block_size, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - - QuantizeWarpImplKernel - <<>>(src, dst, quantize_factor, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchQuantizeWarpImplPadding(hipStream_t stream, const T* src, int8_t* dst, - T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, - cols); - } else { - return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, - cols); - } -} - -template -typename std::enable_if::type DispatchQuantizeWarpImplCols( - hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } else { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchQuantizeWarpImplPadding( \ - stream, src, dst, quantize_factor, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchQuantizeWarpImplCols( - hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define 
DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } else { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchQuantizeWarpImplPadding( \ - stream, src, dst, quantize_factor, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchQuantizeWarpImplPackSize { - hipError_t operator()(hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0) { - return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, - rows, cols); - } else { - return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, - rows, cols); - } - } -}; - -template -__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, - IDX elem_cnt); - -template -__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, - IDX elem_cnt) { - IDX global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - for (int index = global_thread_id * pack_size; index < elem_cnt; - index += gridDim.x * blockDim.x * pack_size) { - IDX quantize_factor_idx = index / col_size; - ComputeType quantize_factor_val = static_cast(quantize_factor[quantize_factor_idx]) - / static_cast(127.0); - using LoadPackType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StorePackType = cuda::elementwise::PackType; - using StorePack = cuda::elementwise::Pack; - LoadPack load_pack{}; - StorePack store_pack{}; - load_pack.storage = *(reinterpret_cast(x) + index / pack_size); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - store_pack.elem[i] = - static_cast(static_cast(load_pack.elem[i]) * quantize_factor_val); - } - *(reinterpret_cast(out) + index / pack_size) = store_pack.storage; - } -} - -template -hipError_t DispatchDequantizeKernelPackSize(hipStream_t stream, const int8_t* src, - T* quantize_factor, T* dst, const int64_t col_size, - const int64_t elem_cnt) { - const int64_t pack_num = elem_cnt / pack_size; - int grid_size = 0; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (err != hipSuccess) { return err; } - hipLaunchKernelGGL(HIP_KERNEL_NAME(DequantizeKernel), grid_size, cuda::elementwise::kBlockSize, 0, stream, src, quantize_factor, dst, col_size, - elem_cnt); - return hipSuccess; -} - -template -inline hipError_t LaunchDequantizeKernel(hipStream_t stream, const int8_t* src, - T* quantize_factor, T* dst, const int64_t col_size, - const int64_t elem_cnt) { - constexpr int quantized_src_pack_size = cuda::elementwise::PackSize(); - constexpr int dst_pack_size = cuda::elementwise::PackSize(); - int launch_pack_size = std::min(quantized_src_pack_size, dst_pack_size); - if (launch_pack_size == 8 && 
col_size % 8 == 0) { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } else if (launch_pack_size == 4 && col_size % 4 == 0) { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } else if (launch_pack_size == 2 && col_size % 2 == 0) { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } else { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } - return hipPeekAtLastError(); -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -template -class EmbeddingShuffleKernel final : public user_op::OpKernel { - public: - EmbeddingShuffleKernel() = default; - ~EmbeddingShuffleKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* kernel_state = dynamic_cast*>(state); - CHECK(kernel_state != nullptr); - const user_op::Tensor* cur_rank_embeddings = - ctx->Tensor4ArgNameAndIndex("cur_rank_embeddings", 0); - const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); - const user_op::Tensor* cur_rank_inverse_indices = - ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); - const user_op::Tensor* inverse_unique_partition_indices = - ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); - user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - ncclComm_t comm = kernel_state->comm(); - using ComputeType = typename DefaultComputeType::type; - const int64_t embedding_size = cur_rank_embeddings->shape_view().At(1); - IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); - DataType data_type = cur_rank_embeddings->data_type(); - const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - bool enable_quantized_comm_env_var = - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); - bool enable_quantized_comm = enable_quantized_comm_env_var && (embedding_size < kMaxColSize); - if (enable_quantized_comm_env_var && !enable_quantized_comm) { - LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " - "embedding_size less equal than 1024 can use quantized communication. 
"; - } - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - OF_CUDA_CHECK(hipMemcpyAsync( - host_num_unique_matrix, reinterpret_cast(num_unique_matrix->dptr()), - parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, cuda_stream)); - CHECK_JUST(ctx->stream()->Sync()); - int64_t cur_rank_num_ids = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; - } - size_t full_elem_cnt = parallel_num * num_ids * embedding_size; - CHECK_EQ(full_elem_cnt, cur_rank_embeddings->shape_view().elem_cnt()); - if (!enable_quantized_comm) { - size_t reverse_unique_cur_rank_embeddings_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; - - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - reverse_unique_cur_rank_embeddings_size + received_embeddings_size); - - T* reverse_unique_cur_rank_embeddings = reinterpret_cast(tmp_buffer->mut_dptr()); - T* received_embeddings = reinterpret_cast(tmp_buffer->mut_dptr() - + reverse_unique_cur_rank_embeddings_size); - // reverse cur_rank unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, cur_rank_embeddings->dptr(), - Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), - reverse_unique_cur_rank_embeddings, 0); - - ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, - received_embeddings); - - // reverse unique_partition - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, - Shape({1, parallel_num * num_ids, embedding_size}), embeddings->mut_dptr(), 0); - } else { - size_t reverse_unique_cur_rank_embeddings_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); - size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; - size_t quantize_cur_rank_embeddings_size = reverse_unique_cur_rank_embeddings_size; - size_t reverse_recv_quantize_cur_rank_embeddings_size = - reverse_unique_cur_rank_embeddings_size; - size_t cur_rank_quantize_factor_size = - GetCudaAlignedSize(cur_rank_embeddings->shape_view().At(0) * sizeof(T)); - size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; - size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; - size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - reverse_unique_cur_rank_embeddings_size + received_embeddings_size - + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size - + reverse_recv_quantize_factor_size); - int8_t* reverse_unique_cur_rank_embeddings = - reinterpret_cast(tmp_buffer->mut_dptr()); - int8_t* received_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size); - int8_t* quantize_cur_rank_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size); - int8_t* reverse_recv_quantize_cur_rank_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + 
quantize_cur_rank_embeddings_size); - T* cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size); - T* reverse_cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size); - T* recv_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size); - T* reverse_recv_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size); - DispatchQuantizeWarpImplPackSize()( - cuda_stream, cur_rank_embeddings->dptr(), quantize_cur_rank_embeddings, - cur_rank_quantize_factor, cur_rank_num_ids, embedding_size); - // reverse cur_rank embedding unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, quantize_cur_rank_embeddings, - Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), - reverse_unique_cur_rank_embeddings, 0); - - // reverse cur_rank quantize factor unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, cur_rank_quantize_factor, - Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, 1}), - reverse_cur_rank_quantize_factor, 0); - - ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, - received_embeddings, reverse_cur_rank_quantize_factor, - recv_quantize_factor); - - // reverse unique_partition - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, - Shape({1, parallel_num * num_ids, embedding_size}), - reverse_recv_quantize_cur_rank_embeddings, 0); - - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape_view().elem_cnt(), recv_quantize_factor, - Shape({1, parallel_num * num_ids, 1}), reverse_recv_quantize_factor, 0); - - int32_t dequantize_row_size = inverse_unique_partition_indices->shape_view().elem_cnt(); - IDX dequantize_elem_cnt = dequantize_row_size * embedding_size; - OF_CUDA_CHECK((LaunchDequantizeKernel( - cuda_stream, reverse_recv_quantize_cur_rank_embeddings, reverse_recv_quantize_factor, - embeddings->mut_dptr(), embedding_size, dequantize_elem_cnt))); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_shuffle") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && 
(user_op::HobDataType("cur_rank_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ - && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& cur_rank_embeddings = \ - ctx->InputTensorDesc("cur_rank_embeddings", 0); \ - bool enable_quantized_comm = \ - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ - && (cur_rank_embeddings.shape().At(1) < kMaxColSize); \ - size_t tmp_size = 0; \ - if (!enable_quantized_comm) { \ - size_t reverse_cur_rank_embeddings_size = GetCudaAlignedSize( \ - cur_rank_embeddings.shape().elem_cnt() * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t recv_unique_embeddings_size = reverse_cur_rank_embeddings_size; \ - tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings_size; \ - } else { \ - size_t total_elem_cnt = cur_rank_embeddings.shape().elem_cnt(); \ - size_t reverse_cur_rank_embeddings_size = \ - GetCudaAlignedSize(total_elem_cnt * sizeof(int8_t)); \ - size_t recv_unique_embeddings = reverse_cur_rank_embeddings_size; \ - size_t quantize_cur_rank_embeddings_size = reverse_cur_rank_embeddings_size; \ - size_t reverse_recv_quantize_cur_rank_embeddings_size = \ - reverse_cur_rank_embeddings_size; \ - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ - cur_rank_embeddings.shape().At(0) * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; \ - tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings \ - + quantize_cur_rank_embeddings_size \ - + reverse_recv_quantize_cur_rank_embeddings_size \ - + cur_rank_quantize_factor_size + reverse_cur_rank_quantize_factor_size \ - + recv_quantize_factor_size + reverse_recv_quantize_factor_size; \ - } \ - return tmp_size; \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL, - FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - - -template -void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - T* unique_partition_embedding_grad, T* received_embeddings_grad) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, - unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, - received_embeddings_grad); -} - -// Quantize Version. -template -void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - int8_t* unique_partition_embedding_grad, - int8_t* received_embeddings_grad, T* cur_rank_quantize_factor, - T* received_cur_rank_quantize_factor) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - // Shuffle Embedding Grad. 
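-  // The quantized gradient exchange happens in two passes: first the int8 gradient
-  // payload is shuffled with row_size = embedding_size and DataType::kInt8, then the
-  // per-row quantize factors are shuffled with row_size = 1 in the original dtype,
-  // so each received gradient row arrives together with the scale needed to
-  // dequantize it on the receiving rank.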
- MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, - unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, - received_embeddings_grad); - // Shuffle Quantize factor. - MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, - parallel_num, &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, cur_rank_quantize_factor, - recv_offsets, recv_elem_cnt, received_cur_rank_quantize_factor); -} - -template -__global__ void UnsortedSegmentHalfGpu(const IDX in_h2_elem_cnt, const IDX h2_inner_dim_size, - const IDX inner_dim_size, const half* data, - const K* segment_ids, const IDX num_segments, - half2* out_h2) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, in_h2_elem_cnt) { - const IDX segment_id_idx = i / h2_inner_dim_size; - const IDX h2_inner_idx = i - segment_id_idx * h2_inner_dim_size; - const IDX inner_idx_0 = 2 * h2_inner_idx; - const IDX inner_idx_1 = inner_idx_0 + 1; - const half* data_row = data + segment_id_idx * inner_dim_size; - half2 val; - val.data.x = data_row[inner_idx_0]; - val.data.y = (inner_idx_1 >= inner_dim_size) ? static_cast(0) : data_row[inner_idx_1]; - const IDX idx = segment_ids[segment_id_idx]; - const IDX out_h2_offset = idx * h2_inner_dim_size + h2_inner_idx; - cuda::atomic::Add(out_h2 + out_h2_offset, val); - } -} - -template -struct UnsortedSegmentSumPad { - void operator()(ep::Stream* stream, const K* segment_ids, const T* data, int64_t num_segment_ids, - int64_t num_segments, int64_t inner_dim_size, int64_t padded_inner_dim_size, - T* out) const { - UNIMPLEMENTED(); - } -}; - -template -struct UnsortedSegmentSumPad { - void operator()(ep::Stream* stream, const K* segment_ids, const half* data, - int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, - int64_t padded_inner_dim_size, half* out) const { - const int64_t data_elem_cnt = num_segment_ids * inner_dim_size; - const int64_t out_elem_cnt = num_segments * padded_inner_dim_size; - CHECK_EQ(padded_inner_dim_size % 2, 0); - CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); - const int64_t h2_inner_dim_size = padded_inner_dim_size / 2; - const int64_t in_h2_elem_cnt = num_segment_ids * h2_inner_dim_size; - if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { - UnsortedSegmentHalfGpu - <<As()->cuda_stream()>>>( - in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, - reinterpret_cast(out)); - } else { - UnsortedSegmentHalfGpu - <<As()->cuda_stream()>>>( - in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, - reinterpret_cast(out)); - } - } -}; - -template -void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const T* data, - int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, - int64_t padded_inner_dim_size, T* out) { - if (inner_dim_size == padded_inner_dim_size) { - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - stream, segment_ids, data, num_segment_ids, num_segments, 1, inner_dim_size, 0, out); - } else { - CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); - UnsortedSegmentSumPad()(stream, segment_ids, data, num_segment_ids, num_segments, - inner_dim_size, padded_inner_dim_size, out); - } -} - -template -void UniquePartitionEmbeddingGrad(ep::Stream* stream, int64_t parallel_id, int64_t 
parallel_num, - int64_t num_ids, int64_t embedding_size, - int64_t padded_embedding_size, const IDX* host_num_unique_matrix, - const T* embedding_grad, - const IDX* inverse_unique_partition_indices, - T* unique_partition_embedding_grad) { - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t offset = i * num_ids * padded_embedding_size; - const int64_t valid_value_size = - host_num_unique_matrix[parallel_id * parallel_num + i] * padded_embedding_size * sizeof(T); - OF_CUDA_CHECK(hipMemsetAsync(unique_partition_embedding_grad + offset, 0, valid_value_size, - stream->As()->cuda_stream())); - } - UnsortedSegmentSum(stream, inverse_unique_partition_indices, embedding_grad, num_ids, - parallel_num * num_ids, embedding_size, padded_embedding_size, - unique_partition_embedding_grad); -} - -template -void UniqueCurRankEmbeddingGrad(ep::Stream* stream, DataType data_type, int64_t cur_rank_num_ids, - int64_t embedding_size, int64_t padded_embedding_size, - const T* cur_rank_embedding_grad, - const IDX* cur_rank_inverse_indices, - T* cur_rank_unique_embedding_grad, T* tmp_buffer) { - T* unsorted_segment_sum_out = - (embedding_size == padded_embedding_size) ? cur_rank_unique_embedding_grad : tmp_buffer; - OF_CUDA_CHECK(hipMemsetAsync(unsorted_segment_sum_out, 0, - cur_rank_num_ids * padded_embedding_size * sizeof(T), - stream->As()->cuda_stream())); - UnsortedSegmentSum(stream, cur_rank_inverse_indices, cur_rank_embedding_grad, - cur_rank_num_ids, cur_rank_num_ids, padded_embedding_size, - padded_embedding_size, unsorted_segment_sum_out); - if (embedding_size != padded_embedding_size) { - std::unique_ptr primitive = - ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); - DimVector dst_shape = {cur_rank_num_ids, embedding_size}; - DimVector dst_pos_vec = {0, 0}; - DimVector src_shape = {cur_rank_num_ids, padded_embedding_size}; - DimVector src_pos_vec = {0, 0}; - DimVector extent_vec = {cur_rank_num_ids, embedding_size}; - primitive->Launch(stream, data_type, 2, cur_rank_unique_embedding_grad, dst_shape.data(), - dst_pos_vec.data(), unsorted_segment_sum_out, src_shape.data(), - src_pos_vec.data(), extent_vec.data()); - } -} - -int64_t GetPaddedEmbeddingSize(DataType data_type, int64_t embedding_size) { - if (data_type == DataType::kFloat16 && embedding_size % 2 != 0) { - return embedding_size + 1; - } else { - return embedding_size; - } -} - -template -class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { - public: - EmbeddingGradientShuffleKernel() = default; - ~EmbeddingGradientShuffleKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* kernel_state = dynamic_cast*>(state); - CHECK(kernel_state != nullptr); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - - const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); - const user_op::Tensor* cur_rank_inverse_indices = - ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); - const user_op::Tensor* inverse_unique_partition_indices = - ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); - user_op::Tensor* cur_rank_unique_embedding_grad = - ctx->Tensor4ArgNameAndIndex("cur_rank_unique_embedding_grad", 0); - const int64_t embedding_size = 
cur_rank_unique_embedding_grad->shape_view().At(1); - IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); - DataType data_type = embedding_grad->data_type(); - const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const int64_t padded_embedding_size = GetPaddedEmbeddingSize(data_type, embedding_size); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - ncclComm_t comm = kernel_state->comm(); - using ComputeType = typename DefaultComputeType::type; - bool enable_quantized_comm_env_var = - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); - bool enable_quantized_comm = - enable_quantized_comm_env_var && (padded_embedding_size < kMaxColSize); - if (enable_quantized_comm_env_var && !enable_quantized_comm) { - LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " - "embedding_size less equal than 1024 can use quantized communication. "; - } - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix->dptr(), - parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, - cuda_stream)); - CHECK_JUST(ctx->stream()->Sync()); - - int64_t cur_rank_num_ids = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; - } - size_t full_num_ids = parallel_num * num_ids; - size_t full_elem_cnt = full_num_ids * padded_embedding_size; - size_t unique_partition_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - - if (!enable_quantized_comm) { - size_t received_embedding_grad_size = unique_partition_embedding_grad_size; - T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); - T* received_embedding_grad = - reinterpret_cast(tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - unique_partition_embedding_grad_size + received_embedding_grad_size); - - UniquePartitionEmbeddingGrad( - ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, - host_num_unique_matrix, embedding_grad->dptr(), - reinterpret_cast(inverse_unique_partition_indices->dptr()), - unique_partition_embedding_grad); - - ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, - padded_embedding_size, data_type, host_num_unique_matrix, - unique_partition_embedding_grad, received_embedding_grad); - - // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. 
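-      // After ShuffleEmbeddingsGrad the partitioned gradients have already been sent,
-      // so their buffer is free to serve as the padded scratch that
-      // UniqueCurRankEmbeddingGrad needs when embedding_size != padded_embedding_size.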
- T* buffer_ptr = unique_partition_embedding_grad; - UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, - padded_embedding_size, received_embedding_grad, - reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); - } else { - size_t received_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); - size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize(full_num_ids * sizeof(T)); - size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; - size_t dequantize_cur_rank_embedding_grad_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - unique_partition_embedding_grad_size + received_embedding_grad_size - + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size - + received_cur_rank_quantize_factor_size - + dequantize_cur_rank_embedding_grad_size); - T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); - int8_t* received_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); - - int8_t* quantize_cur_rank_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size); - T* cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size); - T* received_cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size - + cur_rank_quantize_factor_size); - T* dequantize_cur_rank_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size - + cur_rank_quantize_factor_size + received_cur_rank_quantize_factor_size); - - UniquePartitionEmbeddingGrad( - ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, - host_num_unique_matrix, embedding_grad->dptr(), - reinterpret_cast(inverse_unique_partition_indices->dptr()), - unique_partition_embedding_grad); - - // Quantize. - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t embedding_grad_offset = i * num_ids * padded_embedding_size; - const int64_t quantize_factor_offset = i * num_ids; - const int64_t valid_row_size = host_num_unique_matrix[parallel_id * parallel_num + i]; - DispatchQuantizeWarpImplPackSize()( - cuda_stream, unique_partition_embedding_grad + embedding_grad_offset, - quantize_cur_rank_embedding_grad + embedding_grad_offset, - cur_rank_quantize_factor + quantize_factor_offset, valid_row_size, - padded_embedding_size); - } - - ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, - padded_embedding_size, data_type, host_num_unique_matrix, - quantize_cur_rank_embedding_grad, received_embedding_grad, - cur_rank_quantize_factor, received_cur_rank_quantize_factor); - - int64_t dequantize_cur_rank_num = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - /* - Host num unique matrix: - | Partition0 | Partition1 | - | Rank0 | 2 | 4 | - | Rank1 | 3 | 3 | - After ShuffleEmbeddingGrads, each rank will exchange partition. - For example: - Rank0 will have (matrix[rank0][part0] + matrix[rank1][part0]) grad tensor. 
- Rank1 will have (matrix[rank0][part1] + matrix[rank1][part1]) grad tensor. - */ - dequantize_cur_rank_num += host_num_unique_matrix[i * parallel_num + parallel_id]; - } - IDX dequantize_elem_cnt = dequantize_cur_rank_num * padded_embedding_size; - OF_CUDA_CHECK((LaunchDequantizeKernel( - cuda_stream, received_embedding_grad, received_cur_rank_quantize_factor, - dequantize_cur_rank_embedding_grad, padded_embedding_size, dequantize_elem_cnt))); - // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. - T* buffer_ptr = unique_partition_embedding_grad; - UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, - padded_embedding_size, dequantize_cur_rank_embedding_grad, - reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_gradient_shuffle") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ - && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& cur_rank_unique_embedding_grad = \ - ctx->InputTensorDesc("cur_rank_unique_embedding_grad", 0); \ - size_t cur_rank_embedding_grad_num = cur_rank_unique_embedding_grad.shape().At(0); \ - size_t embedding_size = cur_rank_unique_embedding_grad.shape().At(1); \ - size_t padded_embedding_size = \ - GetPaddedEmbeddingSize(cur_rank_unique_embedding_grad.data_type(), embedding_size); \ - size_t cur_rank_embedding_grad_elem_cnt = \ - cur_rank_embedding_grad_num * padded_embedding_size; \ - bool enable_quantized_comm = \ - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ - && (padded_embedding_size < kMaxColSize); \ - size_t tmp_size = 0; \ - if (!enable_quantized_comm) { \ - size_t cur_rank_embedding_grad_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - tmp_size = 2 * cur_rank_embedding_grad_size; \ - } else { \ - size_t unique_partition_embedding_grad_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t received_embedding_grad_size = \ - GetCudaAlignedSize(cur_rank_embedding_grad_elem_cnt * sizeof(int8_t)); \ - size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; \ - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_num * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t dequantize_cur_rank_embedding_grad_size = unique_partition_embedding_grad_size; \ - tmp_size = unique_partition_embedding_grad_size + received_embedding_grad_size \ - + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size \ - + received_cur_rank_quantize_factor_size \ - + dequantize_cur_rank_embedding_grad_size; \ - } \ - return tmp_size; \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL, - FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class UniqueKeyValuePairKernel final : public user_op::OpKernel 
{ - public: - UniqueKeyValuePairKernel() = default; - ~UniqueKeyValuePairKernel() override = default; - - private: - using user_op::OpKernel::Compute; - - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* keys = ctx->Tensor4ArgNameAndIndex("keys", 0); - user_op::Tensor* num_unique = ctx->Tensor4ArgNameAndIndex("num_unique", 0); - user_op::Tensor* unique_keys = ctx->Tensor4ArgNameAndIndex("unique_keys", 0); - user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); - user_op::Tensor* inverse_indices = ctx->Tensor4ArgNameAndIndex("inverse_indices", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t num_tables = ctx->Attr("num_tables"); - const bool has_values = ctx->has_input("values", 0); - const bool need_values_buffer = (!has_values && num_tables > 1); - size_t values_buffer_bytes = - need_values_buffer ? GetCudaAlignedSize(keys->shape_view().elem_cnt() * sizeof(V)) : 0; - const int64_t num_keys = keys->shape_view().elem_cnt(); - const int64_t hash_capacity = num_keys; - const size_t workspace_bytes = GetCudaAlignedSize(hash_capacity * sizeof(TableEntry)); - CHECK_LE(values_buffer_bytes + workspace_bytes, tmp_buffer->shape_view().elem_cnt()); - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - const V* values_ptr; - if (has_values) { - const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); - values_ptr = reinterpret_cast(values->dptr()); - } else if (need_values_buffer) { - V* values_buffer_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); - hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_keys), kCudaThreadsNumPerBlock, 0, cuda_stream, - num_keys, num_tables, values_buffer_ptr); - values_ptr = values_buffer_ptr; - } else { - values_ptr = nullptr; - } - const bool need_process_table_ids = (has_values || num_tables > 1); - TableEntry* workspace_ptr = - reinterpret_cast*>(tmp_buffer->mut_dptr() + values_buffer_bytes); - UniqueAndPartition( - cuda_stream, num_keys, hash_capacity, 1, reinterpret_cast(keys->dptr()), - values_ptr, reinterpret_cast(num_unique->mut_dptr()), - reinterpret_cast(unique_keys->mut_dptr()), - reinterpret_cast(unique_values->mut_dptr()), - reinterpret_cast(inverse_indices->mut_dptr()), workspace_ptr, workspace_bytes, - need_process_table_ids); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL(k_dtype_pair, value_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("unique_key_value_pair") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("keys", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ - && (user_op::HobDataType("inverse_indices", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(value_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& keys = ctx->InputTensorDesc("keys", 0); \ - const int64_t num_keys = keys.shape().elem_cnt(); \ - const int64_t hash_capacity = num_keys; \ - const size_t workspace_bytes = GetCudaAlignedSize( \ - hash_capacity * sizeof(TableEntry)); \ - const int32_t num_tables = ctx->Attr("num_tables"); \ - const bool has_values = ctx->has_input("values", 0); \ - const bool need_values_buffer = (!has_values && num_tables > 1); \ - size_t values_buffer_bytes = \ - need_values_buffer \ - ? 
GetCudaAlignedSize(num_keys * sizeof(OF_PP_PAIR_FIRST(value_dtype_pair))) \ - : 0; \ - return workspace_bytes + values_buffer_bytes; \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL, ID_DATA_TYPE_SEQ, - ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("id_shuffle"); -REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_shuffle"); -REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_gradient_shuffle"); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/nccl_util.h" +#include "oneflow/core/job/eager_nccl_comm_manager.h" +#include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/user/kernels/gather_kernel_util.h" +#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/embedding/hash_functions.hip.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/hip/atomic.hip.h" + +namespace oneflow { + +namespace { + +template +struct TableEntry { + K key; + uint32_t value; +}; + +template +__global__ void HashTableUniqueAndPartitionPairs(const uint32_t table_capacity, + const uint32_t num_keys, int32_t num_partition, + IDX* unique_counts, TableEntry* table, + const K* keys, const V* values, + K* partitioned_unique_keys, + V* partitioned_unique_values, IDX* reverse_index, + bool need_process_values) { + CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_keys) { + IDX r_index_plus_one = 0; + const K key = keys[i]; + size_t key_hash = HASH()(key); + uint32_t partition_id = key_hash % num_partition; + IDX* unique_count = unique_counts + partition_id; + K* unique_keys = partitioned_unique_keys + partition_id * num_keys; + uint32_t pos = key_hash % table_capacity; + const K key_hi = (key | 0x1); + const K key_lo = (key & 0x1); + uint32_t counter = 0; + while (r_index_plus_one == 0) { + bool prob_next = false; + K* key_ptr = &table[pos].key; + volatile uint32_t* table_value_ptr = &table[pos].value; + const K old_key = cuda::atomic::CAS(key_ptr, 0, key_hi); + if (old_key == 0) { + IDX unique_pos = cuda::atomic::Add(unique_count, 1); + r_index_plus_one = unique_pos + 1; + unique_keys[unique_pos] = key; + if (need_process_values) { + partitioned_unique_values[partition_id * num_keys + unique_pos] = values[i]; + } + *table_value_ptr = ((r_index_plus_one << 1U) | key_lo); + } else if (old_key == key_hi) { + const uint32_t value = *table_value_ptr; + if (value == 0) { + // do nothing + } else if ((value & 0x1) == key_lo) { + r_index_plus_one = (value >> 1U); + } else { + prob_next = true; + } + } else { + prob_next = true; + } + if (prob_next) { + pos += 1; + counter += 1; + if (pos >= table_capacity) { pos -= table_capacity; } + if (counter >= table_capacity) { asm volatile("s_trap 0;"); } + } + } + 
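+    // At this point r_index_plus_one holds the 1-based position of `key` inside its
+    // partition's unique-key list. The table stores key | 0x1, so a stored key of 0
+    // can always be read as "empty slot" even when the real key is 0; the dropped low
+    // bit is kept in the packed slot value ((unique_index + 1) << 1 | key_lo).
+    // Collisions fall through to linear probing, and the s_trap above fires only when
+    // every slot has been probed, i.e. the table is full.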
reverse_index[i] = partition_id * num_keys + r_index_plus_one - 1; + } +} + +template +__global__ void GenerateTableIds(int32_t elem_cnt, int32_t num_tables, U* table_ids) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { table_ids[i] = i % num_tables; } +} + +template +void UniqueAndPartition(hipStream_t cuda_stream, int64_t num_ids, size_t capacity, + int64_t num_partition, const K* ids, const V* table_ids, + IDX* num_partitioned_unique_ids_ptr, K* partitioned_unique_ids, + V* partitioned_unique_table_ids, IDX* inverse_unique_partition_indices, + void* workspace_ptr, size_t workspace_bytes, bool need_process_table_ids) { + size_t table_capacity_bytes = capacity * sizeof(TableEntry); + CHECK_GE(workspace_bytes, table_capacity_bytes); + OF_CUDA_CHECK(hipMemsetAsync(workspace_ptr, 0, table_capacity_bytes, cuda_stream)); + OF_CUDA_CHECK( + hipMemsetAsync(num_partitioned_unique_ids_ptr, 0, num_partition * sizeof(IDX), cuda_stream)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(HashTableUniqueAndPartitionPairs), BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, + capacity, num_ids, num_partition, num_partitioned_unique_ids_ptr, + reinterpret_cast*>(workspace_ptr), ids, table_ids, partitioned_unique_ids, + partitioned_unique_table_ids, inverse_unique_partition_indices, need_process_table_ids); +} + +template +void ShuffleData(hipStream_t cuda_stream, ncclComm_t comm, DataType data_type, + const std::vector& send_offsets, + const std::vector& send_elem_cnt, const T* send_data, + const std::vector& recv_offsets, + const std::vector& recv_elem_cnt, T* recv_data) { + ncclDataType_t nccl_data_type = GetNcclDataType(data_type); + const int64_t parallel_num = send_offsets.size(); + OF_NCCL_CHECK(ncclGroupStart()); + for (int64_t i = 0; i < parallel_num; ++i) { + OF_NCCL_CHECK(ncclSend(send_data + send_offsets.at(i), send_elem_cnt.at(i), nccl_data_type, i, + comm, cuda_stream)); + OF_NCCL_CHECK(ncclRecv(recv_data + recv_offsets.at(i), recv_elem_cnt.at(i), nccl_data_type, i, + comm, cuda_stream)); + } + OF_NCCL_CHECK(ncclGroupEnd()); +} + +template +void MakeShuffleParams(const IDX* host_num_unique_matrix, const int64_t num_ids, + const int64_t row_size, int64_t parallel_id, int64_t parallel_num, + std::vector* scatter_offset_vec, + std::vector* scatter_elem_cnt_vec, + std::vector* gather_offset_vec, + std::vector* gather_elem_cnt_vec) { + scatter_offset_vec->resize(parallel_num); + scatter_elem_cnt_vec->resize(parallel_num); + gather_offset_vec->resize(parallel_num); + gather_elem_cnt_vec->resize(parallel_num); + int64_t gather_offset = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + const int64_t scatter_elem_cnt = + host_num_unique_matrix[parallel_id * parallel_num + i] * row_size; + const int64_t gather_elem_cnt = + host_num_unique_matrix[i * parallel_num + parallel_id] * row_size; + scatter_offset_vec->at(i) = i * num_ids * row_size; + scatter_elem_cnt_vec->at(i) = scatter_elem_cnt; + gather_offset_vec->at(i) = gather_offset; + gather_elem_cnt_vec->at(i) = gather_elem_cnt; + gather_offset += gather_elem_cnt; + } +} + +template +void ShuffleIdsAndTableIds(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, DataType ids_data_type, + DataType table_ids_data_type, IDX* host_num_unique_matrix, + K* partitioned_unique_ids, U* partitioned_unique_table_ids, + K* received_ids, U* received_table_ids, int64_t* received_elem_cnt, + bool need_process_table_ids) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + 
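+  // host_num_unique_matrix is laid out as matrix[rank][partition]: entry
+  // [i * parallel_num + j] counts the unique ids rank i holds for partition j.
+  // MakeShuffleParams below therefore takes the send counts from row parallel_id and
+  // the receive counts from column parallel_id. For example, with two ranks and
+  // matrix = [[2, 4], [3, 3]], rank 0 sends 2 and 4 ids and receives 2 + 3 ids.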
std::vector recv_elem_cnt; + MakeShuffleParams(host_num_unique_matrix, num_ids, 1, parallel_id, parallel_num, &send_offsets, + &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, ids_data_type, send_offsets, send_elem_cnt, partitioned_unique_ids, + recv_offsets, recv_elem_cnt, received_ids); + *received_elem_cnt = recv_offsets.at(parallel_num - 1) + recv_elem_cnt.at(parallel_num - 1); + if (need_process_table_ids) { + ShuffleData(cuda_stream, comm, table_ids_data_type, send_offsets, send_elem_cnt, + partitioned_unique_table_ids, recv_offsets, recv_elem_cnt, received_table_ids); + } +} + +enum class IdShuffleBufferType { + kNumPartitionedUnique = 0, + kPartitionedUniqueIds, + kReceivedIds, + kTableIds, + kPartitionedUniqueTableIds, + kReceivedTableIds, + kWorkspace, + kMaxType +}; + +template +class IdShuffleTmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(IdShuffleTmpBufferManager); + IdShuffleTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t parallel_num, + bool need_table_ids, bool need_process_table_ids) + : offset_(0), + offsets_(static_cast(IdShuffleBufferType::kMaxType), -1), + sizes_(static_cast(IdShuffleBufferType::kMaxType)), + ptr_(ptr) { + const int64_t num_table_ids = need_process_table_ids ? num_ids : 0; + const size_t table_ids_bytes = need_table_ids ? num_ids * sizeof(U) : 0; + AllocBuffer(IdShuffleBufferType::kNumPartitionedUnique, parallel_num * sizeof(IDX)); + size_t partitioned_ids_bytes = parallel_num * num_ids * sizeof(K); + AllocBuffer(IdShuffleBufferType::kPartitionedUniqueIds, partitioned_ids_bytes); + AllocBuffer(IdShuffleBufferType::kReceivedIds, partitioned_ids_bytes); + AllocBuffer(IdShuffleBufferType::kTableIds, table_ids_bytes); + size_t partitioned_table_ids_bytes = parallel_num * num_table_ids * sizeof(U); + AllocBuffer(IdShuffleBufferType::kPartitionedUniqueTableIds, partitioned_table_ids_bytes); + AllocBuffer(IdShuffleBufferType::kReceivedTableIds, partitioned_table_ids_bytes); + const size_t hash_table_capacity = parallel_num * num_ids; + AllocBuffer(IdShuffleBufferType::kWorkspace, hash_table_capacity * sizeof(TableEntry)); + } + + template + T* Ptr(IdShuffleBufferType type) { + CHECK(ptr_ != nullptr); + int64_t offset = offsets_.at(static_cast(type)); + CHECK_NE(offset, -1); + return reinterpret_cast(reinterpret_cast(ptr_) + offset); + } + + int64_t Size(IdShuffleBufferType type) { return sizes_.at(static_cast(type)); } + + size_t TotalBufferSize() const { return offset_; } + + private: + void AllocBuffer(IdShuffleBufferType type, size_t size) { + const size_t type_id = static_cast(type); + CHECK_EQ(offsets_.at(type_id), -1); + offsets_.at(type_id) = offset_; + sizes_.at(type_id) = size; + offset_ += GetCudaAlignedSize(size); + } + size_t offset_; + std::vector offsets_; + std::vector sizes_; + void* ptr_; +}; + +template +class DataShuffleKernelState final : public user_op::OpKernelState { + public: + explicit DataShuffleKernelState(user_op::KernelInitContext* ctx) + : device_index_(-1), + stream_name_(EagerNcclCommMgr::kDefaultStreamName), + parallel_desc_(ctx->parallel_desc()) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } + OF_CUDA_CHECK(hipMallocHost( + reinterpret_cast(&host_num_unique_matrix_), + parallel_desc_.parallel_num() * parallel_desc_.parallel_num() * sizeof(IDX))); + } + ~DataShuffleKernelState() { + CudaCurrentDeviceGuard guard(device_index_); + 
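+    // host_num_unique_matrix_ is pinned host memory from hipMallocHost in the
+    // constructor; it is released here with hipHostFree, with the allocating device
+    // made current by the guard above.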
OF_CUDA_CHECK(hipHostFree(host_num_unique_matrix_)); + } + + ncclComm_t comm() { return GetOrCreate().comm; } + + IDX* HostNumUniqueMatrix() { return host_num_unique_matrix_; } + + private: + struct Comm { + Comm(ncclComm_t comm) : comm(comm) {} + ncclComm_t comm; + }; + + const Comm& GetOrCreate() { + if (!comm_) { Init(); } + return *comm_; + } + + void Init() { + std::set> device_set; + for (int64_t parallel_id = 0; parallel_id < parallel_desc_.parallel_num(); ++parallel_id) { + int64_t machine_id = CHECK_JUST(parallel_desc_.MachineId4ParallelId(parallel_id)); + int64_t device_id = CHECK_JUST(parallel_desc_.DeviceId4ParallelId(parallel_id)); + device_set.emplace(std::make_pair(machine_id, device_id)); + } + EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Singleton::Get()); + ncclComm_t comm; + comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); + comm_.reset(new Comm(comm)); + } + + int device_index_; + bool has_independent_stream_; + std::string stream_name_; + ParallelDesc parallel_desc_; + std::unique_ptr comm_; + IDX* host_num_unique_matrix_; +}; + +} // namespace + +template +class IdShuffleKernel final : public user_op::OpKernel { + public: + IdShuffleKernel() = default; + ~IdShuffleKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* kernel_state = dynamic_cast*>(state); + CHECK(kernel_state != nullptr); + const user_op::Tensor* ids = ctx->Tensor4ArgNameAndIndex("ids", 0); + user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); + user_op::Tensor* inverse_unique_partition_indices = + ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); + user_op::Tensor* cur_rank_num_unique = ctx->Tensor4ArgNameAndIndex("cur_rank_num_unique", 0); + user_op::Tensor* cur_rank_unique_ids = ctx->Tensor4ArgNameAndIndex("cur_rank_unique_ids", 0); + user_op::Tensor* cur_rank_unique_table_ids = + ctx->Tensor4ArgNameAndIndex("cur_rank_unique_table_ids", 0); + user_op::Tensor* cur_rank_inverse_indices = + ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int32_t num_tables = ctx->Attr("num_tables"); + const bool has_table_ids = ctx->has_input("table_ids", 0); + const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); + const bool need_process_table_ids = (has_table_ids || num_tables > 1); + const int64_t num_ids = ids->shape_view().elem_cnt(); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + IdShuffleTmpBufferManager buffer_manager( + tmp_buffer->mut_dptr(), num_ids, parallel_num, need_gen_table_ids, need_process_table_ids); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.TotalBufferSize()); + + const U* table_ids_ptr; + if (has_table_ids) { + const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + table_ids_ptr = reinterpret_cast(table_ids->dptr()); + } else if (need_gen_table_ids) { + hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, + num_ids, num_tables, buffer_manager.template 
Ptr(IdShuffleBufferType::kTableIds)); + table_ids_ptr = buffer_manager.template Ptr(IdShuffleBufferType::kTableIds); + } else { + table_ids_ptr = nullptr; + } + IDX* num_partitioned_unique = + buffer_manager.template Ptr(IdShuffleBufferType::kNumPartitionedUnique); + K* partitioned_unique_ids = + buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueIds); + U* partitioned_unique_table_ids = + buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueTableIds); + IDX* num_unique_matrix_ptr = reinterpret_cast(num_unique_matrix->mut_dptr()); + size_t hash_table_capacity = parallel_num * num_ids; + void* workspace_ptr = buffer_manager.Ptr(IdShuffleBufferType::kWorkspace); + size_t workspace_size = buffer_manager.Size(IdShuffleBufferType::kWorkspace); + UniqueAndPartition( + cuda_stream, num_ids, hash_table_capacity, parallel_num, + reinterpret_cast(ids->dptr()), table_ids_ptr, num_partitioned_unique, + partitioned_unique_ids, partitioned_unique_table_ids, + reinterpret_cast(inverse_unique_partition_indices->mut_dptr()), workspace_ptr, + workspace_size, need_process_table_ids); + ncclComm_t comm = kernel_state->comm(); + OF_NCCL_CHECK(ncclAllGather(num_partitioned_unique, num_unique_matrix_ptr, parallel_num, + GetNcclDataType(num_unique_matrix->data_type()), comm, + cuda_stream)); + IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix_ptr, + parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, + cuda_stream)); + CHECK_JUST(ctx->stream()->Sync()); + + K* received_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedIds); + U* received_table_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedTableIds); + int64_t received_elem_cnt = 0; + ShuffleIdsAndTableIds(cuda_stream, comm, parallel_id, parallel_num, num_ids, ids->data_type(), + cur_rank_unique_table_ids->data_type(), host_num_unique_matrix, + partitioned_unique_ids, partitioned_unique_table_ids, received_ids, + received_table_ids, &received_elem_cnt, need_process_table_ids); + UniqueAndPartition( + cuda_stream, received_elem_cnt, hash_table_capacity, 1, received_ids, received_table_ids, + reinterpret_cast(cur_rank_num_unique->mut_dptr()), + reinterpret_cast(cur_rank_unique_ids->mut_dptr()), + reinterpret_cast(cur_rank_unique_table_ids->mut_dptr()), + reinterpret_cast(cur_rank_inverse_indices->mut_dptr()), workspace_ptr, workspace_size, + need_process_table_ids); + if (!need_process_table_ids) { + OF_CUDA_CHECK(hipMemsetAsync(cur_rank_unique_table_ids->mut_dptr(), 0, + received_elem_cnt * sizeof(U), cuda_stream)); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define ID_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) + +#define TABLE_ID_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ + OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) + +#define IDX_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define REGISTER_CUDA_ID_SHUFFLE_KERNEL(k_dtype_pair, table_id_dtype_pair, idx_dtype_pair) \ + 
REGISTER_USER_KERNEL("id_shuffle") \ + .SetCreateFn< \ + IdShuffleKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("ids", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ + && (user_op::HobDataType("cur_rank_unique_table_ids", 0) \ + == OF_PP_PAIR_SECOND(table_id_dtype_pair)) \ + && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& ids = ctx->InputTensorDesc("ids", 0); \ + const bool has_table_ids = ctx->has_input("table_ids", 0); \ + const int32_t num_tables = ctx->Attr("num_tables"); \ + const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); \ + const bool need_process_table_ids = (has_table_ids || num_tables > 1); \ + IdShuffleTmpBufferManager \ + buffer_manager(nullptr, ids.shape().elem_cnt(), ctx->parallel_desc().parallel_num(), \ + need_gen_table_ids, need_process_table_ids); \ + return buffer_manager.TotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ID_SHUFFLE_KERNEL, ID_DATA_TYPE_SEQ, + TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + T* reverse_unique_cur_rank_embeddings, T* received_embeddings) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, + reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); +} + +// Quantized Version. +template +void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + int8_t* reverse_unique_cur_rank_embeddings, int8_t* received_embeddings, + T* reverse_cur_rank_quantize_factor, T* recv_quantize_factor) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + // shuffle quantized_embedding + MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); + ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, + reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); + // shuffle quantize_factor + MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, + parallel_num, &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, + reverse_cur_rank_quantize_factor, recv_offsets, recv_elem_cnt, recv_quantize_factor); +} + +__device__ float RoundHalfAwayFromZero(const float x) { + float abs_val = abs(x); + float floor_val = floor(abs_val + static_cast(0.5)); + return copysignf(floor_val, x); +} + +// warp reduce version. 
+constexpr int32_t kWarpSize = 32; +constexpr int32_t kMaxColSize = 1024; + +template +__inline__ __device__ T WarpMaxAllReduce(T val) { + for (int32_t lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) { + // val = max(val, __shfl_xor_sync(0xffffffff, val, lane_mask, thread_group_width)); + val = max(val, __shfl_xor(val, lane_mask, thread_group_width)); + } + return val; +} + +inline hipError_t GetWarpImplNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, + int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int tpm; + { + hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); + if (err != hipSuccess) { return err; } + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); + return hipSuccess; +} + +template +__global__ void QuantizeWarpImplKernel(const T* src, int8_t* dst, T* quantize_factor, + const int64_t rows, const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + constexpr int num_packs = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + ComputeType buf[rows_per_access][cols_per_thread]; + const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StoreType = cuda::elementwise::PackType; + using StorePack = cuda::elementwise::Pack; + + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_abs_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; row_id++) { + ComputeType* row_buf = buf[row_id]; + thread_abs_max[row_id] = 0.0; +#pragma unroll + for (int pack_id = 0; pack_id < num_packs; pack_id++) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + LoadPack load_pack; + if (!padding || col < cols) { + const int64_t load_offset = ((row + row_id) * cols + col) / pack_size; + load_pack.storage = *(reinterpret_cast(src) + load_offset); +#pragma unroll + for (int i = 0; i < pack_size; i++) { + row_buf[pack_offset + i] = static_cast(load_pack.elem[i]); + thread_abs_max[row_id] = max(thread_abs_max[row_id], abs(row_buf[pack_offset + i])); + } + } else { +#pragma unroll + for (int i = 0; i < pack_size; i++) { row_buf[pack_offset + i] = 0.0; } + } + } + } + ComputeType warp_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; row_id++) { + warp_max[row_id] = WarpMaxAllReduce(thread_abs_max[row_id]); + if (threadIdx.x == 0) { quantize_factor[row + row_id] = static_cast(warp_max[row_id]); } + ComputeType* row_buf = buf[row_id]; + ComputeType quantize_factor_val = static_cast(127.0) / warp_max[row_id]; +#pragma unroll + for (int col = 0; col < cols_per_thread; col++) { + row_buf[col] = RoundHalfAwayFromZero(row_buf[col] * quantize_factor_val); + } +#pragma unroll + for (int pack_id = 0; pack_id < num_packs; pack_id++) { + const 
int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + StorePack store_pack; + if (!padding || col < cols) { + const int64_t store_offset = ((row + row_id) * cols + col) / pack_size; + for (int i = 0; i < pack_size; i++) { + store_pack.elem[i] = static_cast(row_buf[pack_id * pack_size + i]); + } + *(reinterpret_cast(dst) + store_offset) = store_pack.storage; + } + } + } + } +} + +template +inline hipError_t LaunchQuantizeWarpImpl(hipStream_t stream, const T* src, int8_t* dst, + T* quantize_factor, const int64_t rows, + const int64_t cols) { + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x = 0; + + hipError_t err = GetWarpImplNumBlocks(block_size, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + + QuantizeWarpImplKernel + <<>>(src, dst, quantize_factor, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchQuantizeWarpImplPadding(hipStream_t stream, const T* src, int8_t* dst, + T* quantize_factor, const int64_t rows, + const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, + cols); + } else { + return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, + cols); + } +} + +template +typename std::enable_if::type DispatchQuantizeWarpImplCols( + hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } else { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchQuantizeWarpImplPadding( \ + stream, src, dst, quantize_factor, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchQuantizeWarpImplCols( + hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define 
DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } else { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchQuantizeWarpImplPadding( \ + stream, src, dst, quantize_factor, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchQuantizeWarpImplPackSize { + hipError_t operator()(hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0) { + return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, + rows, cols); + } else { + return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, + rows, cols); + } + } +}; + +template +__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, + IDX elem_cnt); + +template +__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, + IDX elem_cnt) { + IDX global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + for (int index = global_thread_id * pack_size; index < elem_cnt; + index += gridDim.x * blockDim.x * pack_size) { + IDX quantize_factor_idx = index / col_size; + ComputeType quantize_factor_val = static_cast(quantize_factor[quantize_factor_idx]) + / static_cast(127.0); + using LoadPackType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StorePackType = cuda::elementwise::PackType; + using StorePack = cuda::elementwise::Pack; + LoadPack load_pack{}; + StorePack store_pack{}; + load_pack.storage = *(reinterpret_cast(x) + index / pack_size); +#pragma unroll + for (int i = 0; i < pack_size; i++) { + store_pack.elem[i] = + static_cast(static_cast(load_pack.elem[i]) * quantize_factor_val); + } + *(reinterpret_cast(out) + index / pack_size) = store_pack.storage; + } +} + +template +hipError_t DispatchDequantizeKernelPackSize(hipStream_t stream, const int8_t* src, + T* quantize_factor, T* dst, const int64_t col_size, + const int64_t elem_cnt) { + const int64_t pack_num = elem_cnt / pack_size; + int grid_size = 0; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (err != hipSuccess) { return err; } + hipLaunchKernelGGL(HIP_KERNEL_NAME(DequantizeKernel), grid_size, cuda::elementwise::kBlockSize, 0, stream, src, quantize_factor, dst, col_size, + elem_cnt); + return hipSuccess; +} + +template +inline hipError_t LaunchDequantizeKernel(hipStream_t stream, const int8_t* src, + T* quantize_factor, T* dst, const int64_t col_size, + const int64_t elem_cnt) { + constexpr int quantized_src_pack_size = cuda::elementwise::PackSize(); + constexpr int dst_pack_size = cuda::elementwise::PackSize(); + int launch_pack_size = std::min(quantized_src_pack_size, dst_pack_size); + if (launch_pack_size == 8 && 
col_size % 8 == 0) { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } else if (launch_pack_size == 4 && col_size % 4 == 0) { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } else if (launch_pack_size == 2 && col_size % 2 == 0) { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } else { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } + return hipPeekAtLastError(); +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +template +class EmbeddingShuffleKernel final : public user_op::OpKernel { + public: + EmbeddingShuffleKernel() = default; + ~EmbeddingShuffleKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* kernel_state = dynamic_cast*>(state); + CHECK(kernel_state != nullptr); + const user_op::Tensor* cur_rank_embeddings = + ctx->Tensor4ArgNameAndIndex("cur_rank_embeddings", 0); + const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); + const user_op::Tensor* cur_rank_inverse_indices = + ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); + const user_op::Tensor* inverse_unique_partition_indices = + ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); + user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + ncclComm_t comm = kernel_state->comm(); + using ComputeType = typename DefaultComputeType::type; + const int64_t embedding_size = cur_rank_embeddings->shape_view().At(1); + IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); + DataType data_type = cur_rank_embeddings->data_type(); + const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + bool enable_quantized_comm_env_var = + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); + bool enable_quantized_comm = enable_quantized_comm_env_var && (embedding_size < kMaxColSize); + if (enable_quantized_comm_env_var && !enable_quantized_comm) { + LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " + "embedding_size less equal than 1024 can use quantized communication. 
"; + } + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + OF_CUDA_CHECK(hipMemcpyAsync( + host_num_unique_matrix, reinterpret_cast(num_unique_matrix->dptr()), + parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, cuda_stream)); + CHECK_JUST(ctx->stream()->Sync()); + int64_t cur_rank_num_ids = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; + } + size_t full_elem_cnt = parallel_num * num_ids * embedding_size; + CHECK_EQ(full_elem_cnt, cur_rank_embeddings->shape_view().elem_cnt()); + if (!enable_quantized_comm) { + size_t reverse_unique_cur_rank_embeddings_size = + GetCudaAlignedSize(full_elem_cnt * sizeof(T)); + size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; + + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + reverse_unique_cur_rank_embeddings_size + received_embeddings_size); + + T* reverse_unique_cur_rank_embeddings = reinterpret_cast(tmp_buffer->mut_dptr()); + T* received_embeddings = reinterpret_cast(tmp_buffer->mut_dptr() + + reverse_unique_cur_rank_embeddings_size); + // reverse cur_rank unique + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_num_ids, cur_rank_embeddings->dptr(), + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), + reverse_unique_cur_rank_embeddings, 0); + + ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, + data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, + received_embeddings); + + // reverse unique_partition + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, + Shape({1, parallel_num * num_ids, embedding_size}), embeddings->mut_dptr(), 0); + } else { + size_t reverse_unique_cur_rank_embeddings_size = + GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); + size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; + size_t quantize_cur_rank_embeddings_size = reverse_unique_cur_rank_embeddings_size; + size_t reverse_recv_quantize_cur_rank_embeddings_size = + reverse_unique_cur_rank_embeddings_size; + size_t cur_rank_quantize_factor_size = + GetCudaAlignedSize(cur_rank_embeddings->shape_view().At(0) * sizeof(T)); + size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; + size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; + size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + reverse_unique_cur_rank_embeddings_size + received_embeddings_size + + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size + + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size + + reverse_recv_quantize_factor_size); + int8_t* reverse_unique_cur_rank_embeddings = + reinterpret_cast(tmp_buffer->mut_dptr()); + int8_t* received_embeddings = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size); + int8_t* quantize_cur_rank_embeddings = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size); + int8_t* reverse_recv_quantize_cur_rank_embeddings = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + 
quantize_cur_rank_embeddings_size); + T* cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size); + T* reverse_cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size); + T* recv_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size + + reverse_cur_rank_quantize_factor_size); + T* reverse_recv_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size + + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size); + DispatchQuantizeWarpImplPackSize()( + cuda_stream, cur_rank_embeddings->dptr(), quantize_cur_rank_embeddings, + cur_rank_quantize_factor, cur_rank_num_ids, embedding_size); + // reverse cur_rank embedding unique + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_num_ids, quantize_cur_rank_embeddings, + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), + reverse_unique_cur_rank_embeddings, 0); + + // reverse cur_rank quantize factor unique + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_num_ids, cur_rank_quantize_factor, + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, 1}), + reverse_cur_rank_quantize_factor, 0); + + ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, + data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, + received_embeddings, reverse_cur_rank_quantize_factor, + recv_quantize_factor); + + // reverse unique_partition + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, + Shape({1, parallel_num * num_ids, embedding_size}), + reverse_recv_quantize_cur_rank_embeddings, 0); + + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + inverse_unique_partition_indices->shape_view().elem_cnt(), recv_quantize_factor, + Shape({1, parallel_num * num_ids, 1}), reverse_recv_quantize_factor, 0); + + int32_t dequantize_row_size = inverse_unique_partition_indices->shape_view().elem_cnt(); + IDX dequantize_elem_cnt = dequantize_row_size * embedding_size; + OF_CUDA_CHECK((LaunchDequantizeKernel( + cuda_stream, reverse_recv_quantize_cur_rank_embeddings, reverse_recv_quantize_factor, + embeddings->mut_dptr(), embedding_size, dequantize_elem_cnt))); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_shuffle") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && 
(user_op::HobDataType("cur_rank_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ + && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& cur_rank_embeddings = \ + ctx->InputTensorDesc("cur_rank_embeddings", 0); \ + bool enable_quantized_comm = \ + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ + && (cur_rank_embeddings.shape().At(1) < kMaxColSize); \ + size_t tmp_size = 0; \ + if (!enable_quantized_comm) { \ + size_t reverse_cur_rank_embeddings_size = GetCudaAlignedSize( \ + cur_rank_embeddings.shape().elem_cnt() * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t recv_unique_embeddings_size = reverse_cur_rank_embeddings_size; \ + tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings_size; \ + } else { \ + size_t total_elem_cnt = cur_rank_embeddings.shape().elem_cnt(); \ + size_t reverse_cur_rank_embeddings_size = \ + GetCudaAlignedSize(total_elem_cnt * sizeof(int8_t)); \ + size_t recv_unique_embeddings = reverse_cur_rank_embeddings_size; \ + size_t quantize_cur_rank_embeddings_size = reverse_cur_rank_embeddings_size; \ + size_t reverse_recv_quantize_cur_rank_embeddings_size = \ + reverse_cur_rank_embeddings_size; \ + size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ + cur_rank_embeddings.shape().At(0) * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ + size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; \ + size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; \ + tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings \ + + quantize_cur_rank_embeddings_size \ + + reverse_recv_quantize_cur_rank_embeddings_size \ + + cur_rank_quantize_factor_size + reverse_cur_rank_quantize_factor_size \ + + recv_quantize_factor_size + reverse_recv_quantize_factor_size; \ + } \ + return tmp_size; \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL, + FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + + +template +void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + T* unique_partition_embedding_grad, T* received_embeddings_grad) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, + unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, + received_embeddings_grad); +} + +// Quantize Version. +template +void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + int8_t* unique_partition_embedding_grad, + int8_t* received_embeddings_grad, T* cur_rank_quantize_factor, + T* received_cur_rank_quantize_factor) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + // Shuffle Embedding Grad. 
+ MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, + unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, + received_embeddings_grad); + // Shuffle Quantize factor. + MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, + parallel_num, &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, cur_rank_quantize_factor, + recv_offsets, recv_elem_cnt, received_cur_rank_quantize_factor); +} + +template +__global__ void UnsortedSegmentHalfGpu(const IDX in_h2_elem_cnt, const IDX h2_inner_dim_size, + const IDX inner_dim_size, const half* data, + const K* segment_ids, const IDX num_segments, + half2* out_h2) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, in_h2_elem_cnt) { + const IDX segment_id_idx = i / h2_inner_dim_size; + const IDX h2_inner_idx = i - segment_id_idx * h2_inner_dim_size; + const IDX inner_idx_0 = 2 * h2_inner_idx; + const IDX inner_idx_1 = inner_idx_0 + 1; + const half* data_row = data + segment_id_idx * inner_dim_size; + half2 val; + val.data.x = data_row[inner_idx_0]; + val.data.y = (inner_idx_1 >= inner_dim_size) ? static_cast(0) : data_row[inner_idx_1]; + const IDX idx = segment_ids[segment_id_idx]; + const IDX out_h2_offset = idx * h2_inner_dim_size + h2_inner_idx; + cuda::atomic::Add(out_h2 + out_h2_offset, val); + } +} + +template +struct UnsortedSegmentSumPad { + void operator()(ep::Stream* stream, const K* segment_ids, const T* data, int64_t num_segment_ids, + int64_t num_segments, int64_t inner_dim_size, int64_t padded_inner_dim_size, + T* out) const { + UNIMPLEMENTED(); + } +}; + +template +struct UnsortedSegmentSumPad { + void operator()(ep::Stream* stream, const K* segment_ids, const half* data, + int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, + int64_t padded_inner_dim_size, half* out) const { + const int64_t data_elem_cnt = num_segment_ids * inner_dim_size; + const int64_t out_elem_cnt = num_segments * padded_inner_dim_size; + CHECK_EQ(padded_inner_dim_size % 2, 0); + CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); + const int64_t h2_inner_dim_size = padded_inner_dim_size / 2; + const int64_t in_h2_elem_cnt = num_segment_ids * h2_inner_dim_size; + if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { + UnsortedSegmentHalfGpu + <<As()->cuda_stream()>>>( + in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, + reinterpret_cast(out)); + } else { + UnsortedSegmentHalfGpu + <<As()->cuda_stream()>>>( + in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, + reinterpret_cast(out)); + } + } +}; + +template +void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const T* data, + int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, + int64_t padded_inner_dim_size, T* out) { + if (inner_dim_size == padded_inner_dim_size) { + UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( + stream, segment_ids, data, num_segment_ids, num_segments, 1, inner_dim_size, 0, out); + } else { + CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); + UnsortedSegmentSumPad()(stream, segment_ids, data, num_segment_ids, num_segments, + inner_dim_size, padded_inner_dim_size, out); + } +} + +template +void UniquePartitionEmbeddingGrad(ep::Stream* stream, int64_t parallel_id, int64_t 
parallel_num, + int64_t num_ids, int64_t embedding_size, + int64_t padded_embedding_size, const IDX* host_num_unique_matrix, + const T* embedding_grad, + const IDX* inverse_unique_partition_indices, + T* unique_partition_embedding_grad) { + for (int64_t i = 0; i < parallel_num; ++i) { + const int64_t offset = i * num_ids * padded_embedding_size; + const int64_t valid_value_size = + host_num_unique_matrix[parallel_id * parallel_num + i] * padded_embedding_size * sizeof(T); + OF_CUDA_CHECK(hipMemsetAsync(unique_partition_embedding_grad + offset, 0, valid_value_size, + stream->As()->cuda_stream())); + } + UnsortedSegmentSum(stream, inverse_unique_partition_indices, embedding_grad, num_ids, + parallel_num * num_ids, embedding_size, padded_embedding_size, + unique_partition_embedding_grad); +} + +template +void UniqueCurRankEmbeddingGrad(ep::Stream* stream, DataType data_type, int64_t cur_rank_num_ids, + int64_t embedding_size, int64_t padded_embedding_size, + const T* cur_rank_embedding_grad, + const IDX* cur_rank_inverse_indices, + T* cur_rank_unique_embedding_grad, T* tmp_buffer) { + T* unsorted_segment_sum_out = + (embedding_size == padded_embedding_size) ? cur_rank_unique_embedding_grad : tmp_buffer; + OF_CUDA_CHECK(hipMemsetAsync(unsorted_segment_sum_out, 0, + cur_rank_num_ids * padded_embedding_size * sizeof(T), + stream->As()->cuda_stream())); + UnsortedSegmentSum(stream, cur_rank_inverse_indices, cur_rank_embedding_grad, + cur_rank_num_ids, cur_rank_num_ids, padded_embedding_size, + padded_embedding_size, unsorted_segment_sum_out); + if (embedding_size != padded_embedding_size) { + std::unique_ptr primitive = + ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); + DimVector dst_shape = {cur_rank_num_ids, embedding_size}; + DimVector dst_pos_vec = {0, 0}; + DimVector src_shape = {cur_rank_num_ids, padded_embedding_size}; + DimVector src_pos_vec = {0, 0}; + DimVector extent_vec = {cur_rank_num_ids, embedding_size}; + primitive->Launch(stream, data_type, 2, cur_rank_unique_embedding_grad, dst_shape.data(), + dst_pos_vec.data(), unsorted_segment_sum_out, src_shape.data(), + src_pos_vec.data(), extent_vec.data()); + } +} + +int64_t GetPaddedEmbeddingSize(DataType data_type, int64_t embedding_size) { + if (data_type == DataType::kFloat16 && embedding_size % 2 != 0) { + return embedding_size + 1; + } else { + return embedding_size; + } +} + +template +class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { + public: + EmbeddingGradientShuffleKernel() = default; + ~EmbeddingGradientShuffleKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* kernel_state = dynamic_cast*>(state); + CHECK(kernel_state != nullptr); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + + const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); + const user_op::Tensor* cur_rank_inverse_indices = + ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); + const user_op::Tensor* inverse_unique_partition_indices = + ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); + user_op::Tensor* cur_rank_unique_embedding_grad = + ctx->Tensor4ArgNameAndIndex("cur_rank_unique_embedding_grad", 0); + const int64_t embedding_size = 
cur_rank_unique_embedding_grad->shape_view().At(1); + IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); + DataType data_type = embedding_grad->data_type(); + const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + const int64_t padded_embedding_size = GetPaddedEmbeddingSize(data_type, embedding_size); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + ncclComm_t comm = kernel_state->comm(); + using ComputeType = typename DefaultComputeType::type; + bool enable_quantized_comm_env_var = + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); + bool enable_quantized_comm = + enable_quantized_comm_env_var && (padded_embedding_size < kMaxColSize); + if (enable_quantized_comm_env_var && !enable_quantized_comm) { + LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " + "embedding_size less equal than 1024 can use quantized communication. "; + } + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix->dptr(), + parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, + cuda_stream)); + CHECK_JUST(ctx->stream()->Sync()); + + int64_t cur_rank_num_ids = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; + } + size_t full_num_ids = parallel_num * num_ids; + size_t full_elem_cnt = full_num_ids * padded_embedding_size; + size_t unique_partition_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(T)); + + if (!enable_quantized_comm) { + size_t received_embedding_grad_size = unique_partition_embedding_grad_size; + T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); + T* received_embedding_grad = + reinterpret_cast(tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + unique_partition_embedding_grad_size + received_embedding_grad_size); + + UniquePartitionEmbeddingGrad( + ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, + host_num_unique_matrix, embedding_grad->dptr(), + reinterpret_cast(inverse_unique_partition_indices->dptr()), + unique_partition_embedding_grad); + + ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, + padded_embedding_size, data_type, host_num_unique_matrix, + unique_partition_embedding_grad, received_embedding_grad); + + // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. 
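+      // The reuse below is a space optimization: after ShuffleEmbeddingsGrad,
+      // the partitioned gradients in unique_partition_embedding_grad are no
+      // longer read, so the same memory can serve as the scratch buffer of
+      // UniqueCurRankEmbeddingGrad, which (as defined above) only touches it
+      // when embedding_size != padded_embedding_size, first segment-summing in
+      // the padded layout and then copying the unpadded columns out via CopyNd.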
+ T* buffer_ptr = unique_partition_embedding_grad; + UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, + padded_embedding_size, received_embedding_grad, + reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); + } else { + size_t received_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); + size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; + size_t cur_rank_quantize_factor_size = GetCudaAlignedSize(full_num_ids * sizeof(T)); + size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; + size_t dequantize_cur_rank_embedding_grad_size = + GetCudaAlignedSize(full_elem_cnt * sizeof(T)); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + unique_partition_embedding_grad_size + received_embedding_grad_size + + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size + + received_cur_rank_quantize_factor_size + + dequantize_cur_rank_embedding_grad_size); + T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); + int8_t* received_embedding_grad = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); + + int8_t* quantize_cur_rank_embedding_grad = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size); + T* cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size); + T* received_cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size + + cur_rank_quantize_factor_size); + T* dequantize_cur_rank_embedding_grad = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size + + cur_rank_quantize_factor_size + received_cur_rank_quantize_factor_size); + + UniquePartitionEmbeddingGrad( + ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, + host_num_unique_matrix, embedding_grad->dptr(), + reinterpret_cast(inverse_unique_partition_indices->dptr()), + unique_partition_embedding_grad); + + // Quantize. + for (int64_t i = 0; i < parallel_num; ++i) { + const int64_t embedding_grad_offset = i * num_ids * padded_embedding_size; + const int64_t quantize_factor_offset = i * num_ids; + const int64_t valid_row_size = host_num_unique_matrix[parallel_id * parallel_num + i]; + DispatchQuantizeWarpImplPackSize()( + cuda_stream, unique_partition_embedding_grad + embedding_grad_offset, + quantize_cur_rank_embedding_grad + embedding_grad_offset, + cur_rank_quantize_factor + quantize_factor_offset, valid_row_size, + padded_embedding_size); + } + + ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, + padded_embedding_size, data_type, host_num_unique_matrix, + quantize_cur_rank_embedding_grad, received_embedding_grad, + cur_rank_quantize_factor, received_cur_rank_quantize_factor); + + int64_t dequantize_cur_rank_num = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + /* + Host num unique matrix: + | Partition0 | Partition1 | + | Rank0 | 2 | 4 | + | Rank1 | 3 | 3 | + After ShuffleEmbeddingGrads, each rank will exchange partition. + For example: + Rank0 will have (matrix[rank0][part0] + matrix[rank1][part0]) grad tensor. 
+ Rank1 will have (matrix[rank0][part1] + matrix[rank1][part1]) grad tensor. + */ + dequantize_cur_rank_num += host_num_unique_matrix[i * parallel_num + parallel_id]; + } + IDX dequantize_elem_cnt = dequantize_cur_rank_num * padded_embedding_size; + OF_CUDA_CHECK((LaunchDequantizeKernel( + cuda_stream, received_embedding_grad, received_cur_rank_quantize_factor, + dequantize_cur_rank_embedding_grad, padded_embedding_size, dequantize_elem_cnt))); + // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. + T* buffer_ptr = unique_partition_embedding_grad; + UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, + padded_embedding_size, dequantize_cur_rank_embedding_grad, + reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_gradient_shuffle") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ + && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& cur_rank_unique_embedding_grad = \ + ctx->InputTensorDesc("cur_rank_unique_embedding_grad", 0); \ + size_t cur_rank_embedding_grad_num = cur_rank_unique_embedding_grad.shape().At(0); \ + size_t embedding_size = cur_rank_unique_embedding_grad.shape().At(1); \ + size_t padded_embedding_size = \ + GetPaddedEmbeddingSize(cur_rank_unique_embedding_grad.data_type(), embedding_size); \ + size_t cur_rank_embedding_grad_elem_cnt = \ + cur_rank_embedding_grad_num * padded_embedding_size; \ + bool enable_quantized_comm = \ + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ + && (padded_embedding_size < kMaxColSize); \ + size_t tmp_size = 0; \ + if (!enable_quantized_comm) { \ + size_t cur_rank_embedding_grad_size = GetCudaAlignedSize( \ + cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + tmp_size = 2 * cur_rank_embedding_grad_size; \ + } else { \ + size_t unique_partition_embedding_grad_size = GetCudaAlignedSize( \ + cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t received_embedding_grad_size = \ + GetCudaAlignedSize(cur_rank_embedding_grad_elem_cnt * sizeof(int8_t)); \ + size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; \ + size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ + cur_rank_embedding_grad_num * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ + size_t dequantize_cur_rank_embedding_grad_size = unique_partition_embedding_grad_size; \ + tmp_size = unique_partition_embedding_grad_size + received_embedding_grad_size \ + + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size \ + + received_cur_rank_quantize_factor_size \ + + dequantize_cur_rank_embedding_grad_size; \ + } \ + return tmp_size; \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL, + FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class UniqueKeyValuePairKernel final : public user_op::OpKernel 
{ + public: + UniqueKeyValuePairKernel() = default; + ~UniqueKeyValuePairKernel() override = default; + + private: + using user_op::OpKernel::Compute; + + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* keys = ctx->Tensor4ArgNameAndIndex("keys", 0); + user_op::Tensor* num_unique = ctx->Tensor4ArgNameAndIndex("num_unique", 0); + user_op::Tensor* unique_keys = ctx->Tensor4ArgNameAndIndex("unique_keys", 0); + user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); + user_op::Tensor* inverse_indices = ctx->Tensor4ArgNameAndIndex("inverse_indices", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int32_t num_tables = ctx->Attr("num_tables"); + const bool has_values = ctx->has_input("values", 0); + const bool need_values_buffer = (!has_values && num_tables > 1); + size_t values_buffer_bytes = + need_values_buffer ? GetCudaAlignedSize(keys->shape_view().elem_cnt() * sizeof(V)) : 0; + const int64_t num_keys = keys->shape_view().elem_cnt(); + const int64_t hash_capacity = num_keys; + const size_t workspace_bytes = GetCudaAlignedSize(hash_capacity * sizeof(TableEntry)); + CHECK_LE(values_buffer_bytes + workspace_bytes, tmp_buffer->shape_view().elem_cnt()); + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + const V* values_ptr; + if (has_values) { + const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); + values_ptr = reinterpret_cast(values->dptr()); + } else if (need_values_buffer) { + V* values_buffer_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); + hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_keys), kCudaThreadsNumPerBlock, 0, cuda_stream, + num_keys, num_tables, values_buffer_ptr); + values_ptr = values_buffer_ptr; + } else { + values_ptr = nullptr; + } + const bool need_process_table_ids = (has_values || num_tables > 1); + TableEntry* workspace_ptr = + reinterpret_cast*>(tmp_buffer->mut_dptr() + values_buffer_bytes); + UniqueAndPartition( + cuda_stream, num_keys, hash_capacity, 1, reinterpret_cast(keys->dptr()), + values_ptr, reinterpret_cast(num_unique->mut_dptr()), + reinterpret_cast(unique_keys->mut_dptr()), + reinterpret_cast(unique_values->mut_dptr()), + reinterpret_cast(inverse_indices->mut_dptr()), workspace_ptr, workspace_bytes, + need_process_table_ids); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL(k_dtype_pair, value_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("unique_key_value_pair") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("keys", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ + && (user_op::HobDataType("inverse_indices", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(value_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& keys = ctx->InputTensorDesc("keys", 0); \ + const int64_t num_keys = keys.shape().elem_cnt(); \ + const int64_t hash_capacity = num_keys; \ + const size_t workspace_bytes = GetCudaAlignedSize( \ + hash_capacity * sizeof(TableEntry)); \ + const int32_t num_tables = ctx->Attr("num_tables"); \ + const bool has_values = ctx->has_input("values", 0); \ + const bool need_values_buffer = (!has_values && num_tables > 1); \ + size_t values_buffer_bytes = \ + need_values_buffer \ + ? 
GetCudaAlignedSize(num_keys * sizeof(OF_PP_PAIR_FIRST(value_dtype_pair))) \ + : 0; \ + return workspace_bytes + values_buffer_bytes; \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL, ID_DATA_TYPE_SEQ, + ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("id_shuffle"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_shuffle"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_gradient_shuffle"); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/diag_kernel.hip.cpp b/oneflow/user/kernels/diag_kernel.hip.cpp index fa8a382..99550b2 100644 --- a/oneflow/user/kernels/diag_kernel.hip.cpp +++ b/oneflow/user/kernels/diag_kernel.hip.cpp @@ -1,80 +1,80 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/user/kernels/diag_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { - -template -__global__ void vector_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { - CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i * stride] = in_buf[i]; } -} - -template -__global__ void matrix_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { - CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i] = in_buf[i * stride]; } -} - -template -struct DiagFunctor final { - void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t stride, - int32_t in_dim) { - if (in_dim == 1) { - vector_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, - stride); - } else { - matrix_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, - stride); - } - } -}; - -template -struct DiagGradFunctor final { - void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t dx_cnt, int32_t dy_cnt, - int32_t stride, int32_t in_dim) { - if (in_dim == 1) { - matrix_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, - dx_cnt, stride); - } else { - vector_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, - dy_cnt, stride); - } - } -}; - -} // namespace - -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, half); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, float); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, double); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, bool); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, uint8_t); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int8_t); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int32_t); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int64_t); - +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/user/kernels/diag_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { + +template +__global__ void vector_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { + CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i * stride] = in_buf[i]; } +} + +template +__global__ void matrix_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { + CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i] = in_buf[i * stride]; } +} + +template +struct DiagFunctor final { + void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t stride, + int32_t in_dim) { + if (in_dim == 1) { + vector_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, + stride); + } else { + matrix_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, + stride); + } + } +}; + +template +struct DiagGradFunctor final { + void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t dx_cnt, int32_t dy_cnt, + int32_t stride, int32_t in_dim) { + if (in_dim == 1) { + matrix_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, + dx_cnt, stride); + } else { + vector_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, + dy_cnt, stride); + } + } +}; + +} // namespace + +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, half); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, float); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, double); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, bool); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, uint8_t); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int8_t); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int32_t); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/diagonal_kernel.hip.cpp b/oneflow/user/kernels/diagonal_kernel.hip.cpp index cd0815e..e6aa116 100644 --- a/oneflow/user/kernels/diagonal_kernel.hip.cpp +++ b/oneflow/user/kernels/diagonal_kernel.hip.cpp @@ -1,163 +1,163 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { - -template -__global__ void forward_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t dim1, - int32_t dim2) { - int32_t offset_index = (dim1 + 1) * dim2; - CUDA_1D_KERNEL_LOOP(index, size * dim2) { - int32_t i = index / dim2; - int32_t j = index - i * dim2; - out_buf[j * size + i] = in_buf[i * offset_index + j]; - } -} - -template -__global__ void backward_diagonal_kernel(T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, - int32_t dim2) { - int32_t offset_index = (dim1 + 1) * dim2; - CUDA_1D_KERNEL_LOOP(index, size * dim2) { - int32_t i = index / dim2; - int32_t j = index - i * dim2; - dx_buf[i * offset_index + j] = dy_buf[j * size + i]; - } -} - -template -struct DiagonalFunctor final { - void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t dim1, - int32_t dim2) { - if (size * dim2 > 0) { - forward_diagonal_kernel - <<As()->cuda_stream()>>>(out_buf, in_buf, size, dim1, dim2); - } - } -}; - -template -struct DiagonalGradFunctor final { - void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, - int32_t dim2) { - if (size * dim2 > 0) { - backward_diagonal_kernel - <<As()->cuda_stream()>>>(dx_buf, dy_buf, size, dim1, dim2); - } - } -}; - -} // namespace - -template -class GpuDiagonalKernel final : public user_op::OpKernel { - public: - GpuDiagonalKernel() = default; - ~GpuDiagonalKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const int32_t offset = ctx->Attr("offset"); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& out_shape = out->shape_view(); - const ShapeView& in_shape = in->shape_view(); - const T* in_buf = in->dptr(); - T* out_buf = out->mut_dptr(); - - int32_t size = out_shape.At(out_shape.NumAxes() - 1); - int32_t dim1 = in_shape.At(1); - int32_t dim2 = 0; - if (in_shape.NumAxes() <= 2) { - dim2 = 1; - } else { - dim2 = in_shape.Count(2, in_shape.NumAxes()); - } - - int32_t offset_in_bufer = (offset >= 0 ? 
offset * dim2 : -offset * dim1 * dim2); - in_buf += offset_in_bufer; - - DiagonalFunctor()(ctx->stream(), out_buf, in_buf, size, dim1, dim2); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuDiagonalBackwardKernel final : public user_op::OpKernel { - public: - GpuDiagonalBackwardKernel() = default; - ~GpuDiagonalBackwardKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - int32_t offset = ctx->Attr("offset"); - const ShapeView& dx_shape = dx->shape_view(); - const ShapeView& dy_shape = dy->shape_view(); - T* dx_buf = dx->mut_dptr(); - const T* dy_buf = dy->dptr(); - - Memset(ctx->stream(), dx->mut_dptr(), 0, dx_shape.elem_cnt() * sizeof(T)); - - int32_t dim1 = dx_shape.At(1); - int32_t dim2 = 0; - if (dx_shape.NumAxes() <= 2) { - dim2 = 1; - } else { - dim2 = dx_shape.Count(2, dx_shape.NumAxes()); - } - int32_t size = dy_shape.At(dy_shape.NumAxes() - 1); - int32_t offset_in_bufer = (offset >= 0 ? offset * dim2 : -offset * dim1 * dim2); - dx_buf += offset_in_bufer; - - DiagonalGradFunctor()(ctx->stream(), dx_buf, dy_buf, size, dim1, dim2); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DIAGONAL_KERNELS(dtype) \ - REGISTER_USER_KERNEL("diagonal") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("diagonal_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)); - -REGISTER_DIAGONAL_KERNELS(bool); -REGISTER_DIAGONAL_KERNELS(half); -REGISTER_DIAGONAL_KERNELS(float); -REGISTER_DIAGONAL_KERNELS(double); -REGISTER_DIAGONAL_KERNELS(int8_t); -REGISTER_DIAGONAL_KERNELS(int32_t); -REGISTER_DIAGONAL_KERNELS(int64_t); - -#undef REGISTER_DIAGONAL_KERNELS - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { + +template +__global__ void forward_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t dim1, + int32_t dim2) { + int32_t offset_index = (dim1 + 1) * dim2; + CUDA_1D_KERNEL_LOOP(index, size * dim2) { + int32_t i = index / dim2; + int32_t j = index - i * dim2; + out_buf[j * size + i] = in_buf[i * offset_index + j]; + } +} + +template +__global__ void backward_diagonal_kernel(T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, + int32_t dim2) { + int32_t offset_index = (dim1 + 1) * dim2; + CUDA_1D_KERNEL_LOOP(index, size * dim2) { + int32_t i = index / dim2; + int32_t j = index - i * dim2; + dx_buf[i * offset_index + j] = dy_buf[j * size + i]; + } +} + +template +struct DiagonalFunctor final { + void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t dim1, + int32_t dim2) { + if (size * dim2 > 0) { + forward_diagonal_kernel + <<As()->cuda_stream()>>>(out_buf, in_buf, size, dim1, dim2); + } + } +}; + +template +struct DiagonalGradFunctor final { + void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, + int32_t dim2) { + if (size * dim2 > 0) { + backward_diagonal_kernel + <<As()->cuda_stream()>>>(dx_buf, dy_buf, size, dim1, dim2); + } + } +}; + +} // namespace + +template +class GpuDiagonalKernel final : public user_op::OpKernel { + public: + GpuDiagonalKernel() = default; + ~GpuDiagonalKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const int32_t offset = ctx->Attr("offset"); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const ShapeView& out_shape = out->shape_view(); + const ShapeView& in_shape = in->shape_view(); + const T* in_buf = in->dptr(); + T* out_buf = out->mut_dptr(); + + int32_t size = out_shape.At(out_shape.NumAxes() - 1); + int32_t dim1 = in_shape.At(1); + int32_t dim2 = 0; + if (in_shape.NumAxes() <= 2) { + dim2 = 1; + } else { + dim2 = in_shape.Count(2, in_shape.NumAxes()); + } + + int32_t offset_in_bufer = (offset >= 0 ? 
offset * dim2 : -offset * dim1 * dim2); + in_buf += offset_in_bufer; + + DiagonalFunctor()(ctx->stream(), out_buf, in_buf, size, dim1, dim2); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuDiagonalBackwardKernel final : public user_op::OpKernel { + public: + GpuDiagonalBackwardKernel() = default; + ~GpuDiagonalBackwardKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + int32_t offset = ctx->Attr("offset"); + const ShapeView& dx_shape = dx->shape_view(); + const ShapeView& dy_shape = dy->shape_view(); + T* dx_buf = dx->mut_dptr(); + const T* dy_buf = dy->dptr(); + + Memset(ctx->stream(), dx->mut_dptr(), 0, dx_shape.elem_cnt() * sizeof(T)); + + int32_t dim1 = dx_shape.At(1); + int32_t dim2 = 0; + if (dx_shape.NumAxes() <= 2) { + dim2 = 1; + } else { + dim2 = dx_shape.Count(2, dx_shape.NumAxes()); + } + int32_t size = dy_shape.At(dy_shape.NumAxes() - 1); + int32_t offset_in_bufer = (offset >= 0 ? offset * dim2 : -offset * dim1 * dim2); + dx_buf += offset_in_bufer; + + DiagonalGradFunctor()(ctx->stream(), dx_buf, dy_buf, size, dim1, dim2); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DIAGONAL_KERNELS(dtype) \ + REGISTER_USER_KERNEL("diagonal") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("diagonal_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)); + +REGISTER_DIAGONAL_KERNELS(bool); +REGISTER_DIAGONAL_KERNELS(half); +REGISTER_DIAGONAL_KERNELS(float); +REGISTER_DIAGONAL_KERNELS(double); +REGISTER_DIAGONAL_KERNELS(int8_t); +REGISTER_DIAGONAL_KERNELS(int32_t); +REGISTER_DIAGONAL_KERNELS(int64_t); + +#undef REGISTER_DIAGONAL_KERNELS + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp b/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp index 69ce03c..934c29f 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp +++ b/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp @@ -1,65 +1,65 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/dim_gather_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template -__global__ void DoCUDADimGather(const DimOpIndexNdHelper input_nd_helper, - const DimOpIndexNdHelper index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim_length, int32_t dim, - const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, - input, output); -} - -template -struct DimGatherFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const IN_T* input, - IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimGather), stream, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, input, - output); - } -}; - -// float16 special case of DimGatherFunctor template -template -struct DimGatherFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const float16* input, - float16* output) { - RUN_CUDA_KERNEL((DoCUDADimGather), stream, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, - reinterpret_cast(input), reinterpret_cast(output)); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kCUDA), - DIM_GATHER_SCATTER_DATA_TYPE_CUDA_SEQ, INDEX_DATA_TYPE_SEQ); - -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#ifdef WITH_ROCM
+#include "hip/hip_runtime.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/user/kernels/dim_gather_kernel_util.h"
+
+namespace oneflow {
+
+namespace user_op {
+
+template<typename IN_T, typename IDX_T>
+__global__ void DoCUDADimGather(const DimOpIndexNdHelper<IDX_T> input_nd_helper,
+                                const DimOpIndexNdHelper<IDX_T> index_nd_helper, int ndim,
+                                int64_t elem_cnt, int32_t dim_length, int32_t dim,
+                                const IDX_T* index, const IN_T* input, IN_T* output) {
+  DoDimGather<IN_T, IDX_T>(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index,
+                           input, output);
+}
+
+template<typename IN_T, typename IDX_T>
+struct DimGatherFunctor<DeviceType::kCUDA, IN_T, IDX_T> final {
+  void operator()(ep::Stream* stream, const DimOpIndexNdHelper<IDX_T>& input_nd_helper,
+                  const DimOpIndexNdHelper<IDX_T>& index_nd_helper, int ndim, int64_t elem_cnt,
+                  int32_t dim_length, int32_t dim, const IDX_T* index, const IN_T* input,
+                  IN_T* output) {
+    RUN_CUDA_KERNEL((DoCUDADimGather<IN_T, IDX_T>), stream, BlocksNum4ThreadsNum(elem_cnt),
+                    input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, input,
+                    output);
+  }
+};
+
+// float16 special case of DimGatherFunctor template
+template<typename IDX_T>
+struct DimGatherFunctor<DeviceType::kCUDA, float16, IDX_T> final {
+  void operator()(ep::Stream* stream, const DimOpIndexNdHelper<IDX_T>& input_nd_helper,
+                  const DimOpIndexNdHelper<IDX_T>& index_nd_helper, int ndim, int64_t elem_cnt,
+                  int32_t dim_length, int32_t dim, const IDX_T* index, const float16* input,
+                  float16* output) {
+    RUN_CUDA_KERNEL((DoCUDADimGather<half, IDX_T>), stream, BlocksNum4ThreadsNum(elem_cnt),
+                    input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index,
+                    reinterpret_cast<const half*>(input), reinterpret_cast<half*>(output));
+  }
+};
+
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kCUDA),
+                                 DIM_GATHER_SCATTER_DATA_TYPE_CUDA_SEQ, INDEX_DATA_TYPE_SEQ);
+
+}  // namespace user_op
+}  // namespace oneflow
+
 #endif  // WITH_ROCM
\ No newline at end of file
diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp
index cc996ea..d436a74 100644
--- a/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp
+++ b/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp
@@ -1,67 +1,67 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/dim_scatter_kernel_util.h" - -namespace oneflow { -namespace user_op { - -template class Opt> -__global__ void DoCUDADimScatter(const DimOpIndexNdHelper src_nd_helper, - const DimOpIndexNdHelper idx_nd_helper, - const DimOpIndexNdHelper output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, - const int64_t upper_bound, const IDX_T* index, const IN_T* src, - IN_T* output) { - DoDimScatter(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, - dim, upper_bound, index, src, output); -} - -template class Opt> -struct DimScatterFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, - const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const IN_T* src, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, src, output); - } -}; - -template class Opt> -struct DimScatterFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, - const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const float16* src, float16* output) { - RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, reinterpret_cast(src), - reinterpret_cast(output)); - } -}; - -INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpAddFunctor); -INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpUpdateFunctor); - -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/dim_scatter_kernel_util.h" + +namespace oneflow { +namespace user_op { + +template class Opt> +__global__ void DoCUDADimScatter(const DimOpIndexNdHelper src_nd_helper, + const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, + const int64_t upper_bound, const IDX_T* index, const IN_T* src, + IN_T* output) { + DoDimScatter(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src, output); +} + +template class Opt> +struct DimScatterFunctor final { + void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T* src, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src, output); + } +}; + +template class Opt> +struct DimScatterFunctor final { + void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const float16* src, float16* output) { + RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, reinterpret_cast(src), + reinterpret_cast(output)); + } +}; + +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpAddFunctor); +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpUpdateFunctor); + +} // namespace user_op +} // namespace oneflow + #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp index 297271b..e01a1ef 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp @@ -1,51 +1,51 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/dim_scatter_scalar_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template class Opt> -__global__ void DoCUDADimScatterScalar(const DimOpIndexNdHelper idx_nd_helper, - const DimOpIndexNdHelper output_nd_helper, - const int ndim, const int64_t elem_cnt, const int32_t dim, - const int64_t upper_bound, const IDX_T* index, - const IN_T src_scalar, IN_T* output) { - DoScatterScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, src_scalar, output); -} - -template class Opt> -struct DimScatterScalarFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, - const IDX_T* index, const IN_T src, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimScatterScalar), stream, - BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, - dim, upper_bound, index, src, output); - } -}; - -INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, UpdateScalarFunctor); -INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, AddScalarFunctor); - -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/dim_scatter_scalar_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template class Opt> +__global__ void DoCUDADimScatterScalar(const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, + const int ndim, const int64_t elem_cnt, const int32_t dim, + const int64_t upper_bound, const IDX_T* index, + const IN_T src_scalar, IN_T* output) { + DoScatterScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src_scalar, output); +} + +template class Opt> +struct DimScatterScalarFunctor final { + void operator()(ep::Stream* stream, const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimScatterScalar), stream, + BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src, output); + } +}; + +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, UpdateScalarFunctor); +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, AddScalarFunctor); + +} // namespace user_op +} // namespace oneflow #endif \ No newline at end of file diff --git a/oneflow/user/kernels/distributions/normal_distribution.hip.cpp b/oneflow/user/kernels/distributions/normal_distribution.hip.cpp index 6056f47..6a888fc 100644 --- a/oneflow/user/kernels/distributions/normal_distribution.hip.cpp +++ b/oneflow/user/kernels/distributions/normal_distribution.hip.cpp @@ -1,71 +1,71 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/distributions/normal_distribution.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ T GenNormal(hiprandState* state, const T mean, const T std); - -template<> -__device__ float GenNormal(hiprandState* state, const float mean, const float std) { - return (hiprand_normal(state) + mean) / std; -} - -template<> -__device__ double GenNormal(hiprandState* state, const double mean, const double std) { - return (hiprand_normal_double(state) + mean) / std; -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T mean, - const T std) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenNormal(&localState, mean, std); } - state[id] = localState; -} - -} // namespace - -template -void NormalDistribution::operator()( - ep::Stream* stream, const int64_t elem_cnt, T* dptr, - const std::shared_ptr& generator) const { - CHECK_GE(elem_cnt, 0); - const auto device_index = stream->device()->device_index(); - auto gen = CHECK_JUST(generator->Get(device_index)); - int32_t block_num = gen->max_block_num(); - int32_t thread_num = gen->max_thread_num(); - auto* curand_states = gen->curand_states(); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, elem_cnt, dptr, mean_, std_); -} - -#define INITIATE_CUDA_NORMAL_DISTRIBUTION(T, typeproto) \ - template void NormalDistribution::operator()( \ - ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ - const std::shared_ptr& generator) const; - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_NORMAL_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/distributions/normal_distribution.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ T GenNormal(hiprandState* state, const T mean, const T std); + +template<> +__device__ float GenNormal(hiprandState* state, const float mean, const float std) { + return (hiprand_normal(state) + mean) / std; +} + +template<> +__device__ double GenNormal(hiprandState* state, const double mean, const double std) { + return (hiprand_normal_double(state) + mean) / std; +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T mean, + const T std) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenNormal(&localState, mean, std); } + state[id] = localState; +} + +} // namespace + +template +void NormalDistribution::operator()( + ep::Stream* stream, const int64_t elem_cnt, T* dptr, + const std::shared_ptr& generator) const { + CHECK_GE(elem_cnt, 0); + const auto device_index = stream->device()->device_index(); + auto gen = CHECK_JUST(generator->Get(device_index)); + int32_t block_num = gen->max_block_num(); + int32_t thread_num = gen->max_thread_num(); + auto* curand_states = gen->curand_states(); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, elem_cnt, dptr, mean_, std_); +} + +#define INITIATE_CUDA_NORMAL_DISTRIBUTION(T, typeproto) \ + template void NormalDistribution::operator()( \ + ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ + const std::shared_ptr& generator) const; + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_NORMAL_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp b/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp index b6fbc7c..d22241c 100644 --- a/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp +++ b/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp @@ -1,77 +1,77 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/user/kernels/distributions/uniform_distribution.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ T GenUniform(hiprandState* state, const T low, const T high); - -template<> -__device__ float GenUniform(hiprandState* state, const float low, const float high) { - auto rand_num = hiprand_uniform(state); - // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here - if (rand_num == 1.0) { rand_num = 0.0; } - return rand_num * (high - low) + low; -} - -template<> -__device__ double GenUniform(hiprandState* state, const double low, const double high) { - auto rand_num = hiprand_uniform_double(state); - // hiprand_uniform_double generates (0.0, 1.0], but we want [0.0, 1.0) here - if (rand_num == 1.0) { rand_num = 0.0; } - return rand_num * (high - low) + low; -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T low, - const T high) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenUniform(&localState, low, high); } - state[id] = localState; -} - -} // namespace - -template -void UniformDistribution::operator()( - ep::Stream* stream, const int64_t elem_cnt, T* dptr, - const std::shared_ptr& generator) const { - CHECK_GE(elem_cnt, 0); - const auto device_index = stream->device()->device_index(); - auto gen = CHECK_JUST(generator->Get(device_index)); - int32_t block_num = gen->max_block_num(); - int32_t thread_num = gen->max_thread_num(); - auto* curand_states = gen->curand_states(); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, elem_cnt, dptr, low_, high_); -} - -#define INITIATE_CUDA_UNIFORM_DISTRIBUTION(T, typeproto) \ - template void UniformDistribution::operator()( \ - ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ - const std::shared_ptr& generator) const; - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/user/kernels/distributions/uniform_distribution.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ T GenUniform(hiprandState* state, const T low, const T high); + +template<> +__device__ float GenUniform(hiprandState* state, const float low, const float high) { + auto rand_num = hiprand_uniform(state); + // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here + if (rand_num == 1.0) { rand_num = 0.0; } + return rand_num * (high - low) + low; +} + +template<> +__device__ double GenUniform(hiprandState* state, const double low, const double high) { + auto rand_num = hiprand_uniform_double(state); + // hiprand_uniform_double generates (0.0, 1.0], but we want [0.0, 1.0) here + if (rand_num == 1.0) { rand_num = 0.0; } + return rand_num * (high - low) + low; +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T low, + const T high) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenUniform(&localState, low, high); } + state[id] = localState; +} + +} // namespace + +template +void UniformDistribution::operator()( + ep::Stream* stream, const int64_t elem_cnt, T* dptr, + const std::shared_ptr& generator) const { + CHECK_GE(elem_cnt, 0); + const auto device_index = stream->device()->device_index(); + auto gen = CHECK_JUST(generator->Get(device_index)); + int32_t block_num = gen->max_block_num(); + int32_t thread_num = gen->max_thread_num(); + auto* curand_states = gen->curand_states(); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, elem_cnt, dptr, low_, high_); +} + +#define INITIATE_CUDA_UNIFORM_DISTRIBUTION(T, typeproto) \ + template void UniformDistribution::operator()( \ + ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ + const std::shared_ptr& generator) const; + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp b/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp index 0ff5160..e4e72a2 100644 --- a/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp +++ b/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp @@ -1,72 +1,72 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/dtype.h" -#include "oneflow/user/kernels/distributions/uniform_int_distribution.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -__device__ int64_t GenUniformInt(hiprandState* state, const int64_t low, const int64_t high) { - auto rand_num = hiprand_uniform(state); - // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here - if (rand_num == 1.0) { rand_num = 0.0; } - return static_cast(rand_num * (high - low) + low); -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const int64_t low, - const int64_t high) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - dptr[i] = static_cast(GenUniformInt(&localState, low, high)); - } - state[id] = localState; -} - -} // namespace - -template -void UniformIntDistribution::operator()( - ep::Stream* stream, const int64_t elem_cnt, T* dptr, - const std::shared_ptr& generator) const { - CHECK_GE(elem_cnt, 0); - const auto device_index = stream->device()->device_index(); - auto gen = CHECK_JUST(generator->Get(device_index)); - int32_t block_num = gen->max_block_num(); - int32_t thread_num = gen->max_thread_num(); - auto* curand_states = gen->curand_states(); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, elem_cnt, dptr, low_, high_); -} - -#define INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION(T, typeproto) \ - template void UniformIntDistribution::operator()( \ - ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ - const std::shared_ptr& generator) const; - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, INT_DATA_TYPE_SEQ) -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, UNSIGNED_INT_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/dtype.h" +#include "oneflow/user/kernels/distributions/uniform_int_distribution.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +__device__ int64_t GenUniformInt(hiprandState* state, const int64_t low, const int64_t high) { + auto rand_num = hiprand_uniform(state); + // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here + if (rand_num == 1.0) { rand_num = 0.0; } + return static_cast(rand_num * (high - low) + low); +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const int64_t low, + const int64_t high) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + dptr[i] = static_cast(GenUniformInt(&localState, low, high)); + } + state[id] = localState; +} + +} // namespace + +template +void UniformIntDistribution::operator()( + ep::Stream* stream, const int64_t elem_cnt, T* dptr, + const std::shared_ptr& generator) const { + CHECK_GE(elem_cnt, 0); + const auto device_index = stream->device()->device_index(); + auto gen = CHECK_JUST(generator->Get(device_index)); + int32_t block_num = gen->max_block_num(); + int32_t thread_num = gen->max_thread_num(); + auto* curand_states = gen->curand_states(); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, elem_cnt, dptr, low_, high_); +} + +#define INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION(T, typeproto) \ + template void UniformIntDistribution::operator()( \ + ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ + const std::shared_ptr& generator) const; + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, INT_DATA_TYPE_SEQ) +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, UNSIGNED_INT_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dropout_kernel.hip.cpp b/oneflow/user/kernels/dropout_kernel.hip.cpp index 4fe3552..5a4aab1 100644 --- a/oneflow/user/kernels/dropout_kernel.hip.cpp +++ b/oneflow/user/kernels/dropout_kernel.hip.cpp @@ -1,463 +1,463 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/op_kernel_wrapper.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/dropout_kernel.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -// #include "oneflow/core/device/cuda_pseudo_bfloat16.h" -namespace oneflow { - -namespace { - -constexpr int32_t kVecSize = 4; -constexpr int32_t kBlockSize = 256; - -template -constexpr int32_t GetDropoutPackSize() { - // For float, bfloat16, half. - return 4; -}; - -template<> -constexpr int32_t GetDropoutPackSize() { - return 2; -}; - -template<> -constexpr int32_t GetDropoutPackSize() { - return 2; -}; - -union RandPack4 { - float4 storage; - float elem[4]; -}; - -template -struct GetPack2Type { - using T2 = typename std::aligned_storage<2 * sizeof(T), 2 * sizeof(T)>::type; -}; - -template<> -struct GetPack2Type { - using T2 = half2; -}; - - -template -using Pack2Type = typename GetPack2Type::T2; - -using H2PackType = typename std::aligned_storage<4 * sizeof(half), 4 * sizeof(half)>::type; - -template -union H2Pack { - cuda::elementwise::Pack pack_storage; - Pack2Type h2[2]; - __device__ H2Pack() { - // do nothing - } -}; - -template<> -union H2Pack { - cuda::elementwise::Pack pack_storage; - half2 h2[2]; - __device__ H2Pack() { - // do nothing - } -}; - -template -__device__ Pack2Type Make2(float v); - -template<> -__device__ Pack2Type Make2(float v) { - return __float2half2_rn(v); -} - -#define RETURN_VOID_IF_HALF typename std::enable_if_t::value, void> - -#define RETURN_VOID_IF_FLOAT typename std::enable_if_t::value, void> -#define RETURN_VOID_IF_DOUBLE typename std::enable_if_t::value, void> - -template -__global__ RETURN_VOID_IF_FLOAT FusedDropoutAddGpu( - uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, - const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, - const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using MaskType = cuda::elementwise::PackType; - using MaskPack = cuda::elementwise::Pack; - - T t_scale = static_cast(scale); - RandPack4 rand_uniform_pack4; - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - rand_uniform_pack4.storage = hiprand_uniform4(&state); - - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - LoadPack addend_vec; - if (has_addend) { - const LoadType* addend_load = reinterpret_cast(addend + linear_index); - addend_vec.storage = *addend_load; - } - - MaskPack mask_vec; - LoadPack y_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; - T tmp_float_mask = static_cast(mask_vec.elem[i]); - y_vec.elem[i] = x_vec.elem[i] * tmp_float_mask * t_scale; - if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } - } - - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; - } - - if (tail && 
global_thread_id < n_tail) { - const float rand_uniform = hiprand_uniform(&state); - const bool mask_val = rand_uniform > rate; - tail_mask[global_thread_id] = mask_val; - T tmp_float_mask = static_cast(mask_val); - T tmp_tail_out = tail_x[global_thread_id] * tmp_float_mask * t_scale; - if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } - tail_y[global_thread_id] = tmp_tail_out; - } - - __syncthreads(); - - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -template -__global__ RETURN_VOID_IF_HALF FusedDropoutAddGpu( - uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, - const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, - const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StoreType = cuda::elementwise::PackType, pack_size / 2>; - using StorePack = cuda::elementwise::Pack, pack_size / 2>; - using MaskType = cuda::elementwise::PackType; - using MaskPack = cuda::elementwise::Pack; - - RandPack4 rand_uniform_pack4; - Pack2Type h2_scale = Make2(scale); - - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - rand_uniform_pack4.storage = hiprand_uniform4(&state); - const LoadType* x_load = reinterpret_cast(x + linear_index); - H2Pack x_vec{}; - x_vec.pack_storage.storage = *x_load; - - H2Pack addend_vec{}; - if (has_addend) { - const LoadType* addend_load = reinterpret_cast(addend + linear_index); - addend_vec.pack_storage.storage = *addend_load; - } - - MaskPack mask_vec; - StorePack y_vec; - StorePack one_or_zero_h2; - - mask_vec.elem[0] = rand_uniform_pack4.elem[0] > rate; - float tmp_float_mask = static_cast(mask_vec.elem[0]); - one_or_zero_h2.elem[0].x = tmp_float_mask; - mask_vec.elem[1] = rand_uniform_pack4.elem[1] > rate; - tmp_float_mask = static_cast(mask_vec.elem[1]); - one_or_zero_h2.elem[0].y = tmp_float_mask; - y_vec.elem[0] = __hmul2(__hmul2(x_vec.h2[0], one_or_zero_h2.elem[0]), h2_scale); - - mask_vec.elem[2] = rand_uniform_pack4.elem[2] > rate; - tmp_float_mask = static_cast(mask_vec.elem[2]); - one_or_zero_h2.elem[1].x = tmp_float_mask; - mask_vec.elem[3] = rand_uniform_pack4.elem[3] > rate; - tmp_float_mask = static_cast(mask_vec.elem[3]); - one_or_zero_h2.elem[1].y = tmp_float_mask; - y_vec.elem[1] = __hmul2(__hmul2(x_vec.h2[1], one_or_zero_h2.elem[1]), h2_scale); - - if (has_addend) { - y_vec.elem[0] = __hadd2(y_vec.elem[0], addend_vec.h2[0]); - y_vec.elem[1] = __hadd2(y_vec.elem[1], addend_vec.h2[1]); - } - - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; - } - - if (tail && global_thread_id < n_tail) { - const float rand_uniform = hiprand_uniform(&state); - const bool mask_val = rand_uniform > rate; - tail_mask[global_thread_id] = mask_val; - float tmp_half_mask = static_cast(mask_val); - T tmp_tail_out = tail_x[global_thread_id] * static_cast(tmp_half_mask) * 
static_cast(h2_scale.data.x); - if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } - tail_y[global_thread_id] = tmp_tail_out; - } - - __syncthreads(); - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -template -__global__ RETURN_VOID_IF_DOUBLE FusedDropoutAddGpu( - uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, - const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, - const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using MaskType = cuda::elementwise::PackType; - using MaskPack = cuda::elementwise::Pack; - - RandPack4 rand_uniform_pack4; - bool grid_loop_rand_state = 0; - - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - if (grid_loop_rand_state == 0) { - rand_uniform_pack4.storage = hiprand_uniform4(&state); - grid_loop_rand_state ^= 1; - } else { - // Use the last two random numbers we generated in previous iteration. - rand_uniform_pack4.elem[0] = rand_uniform_pack4.elem[2]; - rand_uniform_pack4.elem[1] = rand_uniform_pack4.elem[3]; - grid_loop_rand_state ^= 1; - } - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - LoadPack addend_vec; - if (has_addend) { - const LoadType* addend_load = reinterpret_cast(addend + linear_index); - addend_vec.storage = *addend_load; - } - - MaskPack mask_vec; - LoadPack y_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; - y_vec.elem[i] = x_vec.elem[i] * mask_vec.elem[i] * scale; - if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } - } - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; - } - - if (tail && global_thread_id < n_tail) { - const float rand_uniform = hiprand_uniform(&state); - const bool mask_val = rand_uniform > rate; - tail_mask[global_thread_id] = mask_val; - double tmp_tail_out = tail_x[global_thread_id] * mask_val * scale; - if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } - tail_y[global_thread_id] = tmp_tail_out; - } - - __syncthreads(); - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -unsigned int ComputeGridSize(ep::Stream* stream, const int32_t block_size, const int64_t elem_cnt) { - auto* cuda_stream = stream->As(); - const int32_t max_threads_multi_process = - cuda_stream->device_properties().maxThreadsPerMultiProcessor; - const int32_t multi_processor_count = cuda_stream->device_properties().multiProcessorCount; - unsigned int blocks_per_sm = max_threads_multi_process / block_size; - unsigned int grid_size = std::max((int64_t)1, 
((elem_cnt + block_size - 1) / block_size)); - grid_size = std::min((unsigned int)multi_processor_count * blocks_per_sm, grid_size); - return grid_size; -} - -template -void DispatchTail(ep::Stream* stream, uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, - const int64_t elem_cnt, float rate, float scale, const T* x, bool* mask, - const T* addend, T* y) { - constexpr int pack_size = GetDropoutPackSize(); - const int64_t pack_num = elem_cnt / pack_size; - unsigned int grid_size = ComputeGridSize(stream, kBlockSize, pack_num); - const int64_t tail_offset = pack_num * pack_size; - const int64_t n_tail = elem_cnt - tail_offset; - const bool tail = n_tail > 0 ? true : false; - uint64_t inc_offset = 0; - - if (tail) { - // If tail, we need generate randnum one more time, so here we add another `1`. - inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize + 1; - FusedDropoutAddGpu - <<As()->cuda_stream()>>>( - seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, - (x + tail_offset), (mask + tail_offset), (addend + tail_offset), (y + tail_offset)); - } else { - inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize; - FusedDropoutAddGpu - <<As()->cuda_stream()>>>( - seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, - nullptr, nullptr, nullptr, nullptr); - } -} - -template -struct MaskAndScaleFunctor { - OF_DEVICE_FUNC explicit MaskAndScaleFunctor(float scale) : scale(scale) {} - __device__ T operator()(T x, bool mask) const { - return x * static_cast(mask) * static_cast(scale); - } - float scale; -}; - - - -template -class DropoutKernelGPU final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - DropoutKernelGPU() = default; - ~DropoutKernelGPU() = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA)); - return std::make_shared(generator); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - auto* fused_dropout_kernel_state = dynamic_cast(state); - CHECK_NOTNULL(fused_dropout_kernel_state); - const auto& generator = fused_dropout_kernel_state->generator(); - CHECK_NOTNULL(generator); - auto* stream = ctx->stream(); - const auto device_index = stream->device()->device_index(); - std::shared_ptr cuda_generator = - CHECK_JUST(generator->Get(device_index)); - uint64_t seed = cuda_generator->current_seed(); - - const float rate = ctx->Attr("rate"); - float scale = 0.0; - if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } - one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); - - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - DispatchTail( - stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, - reinterpret_cast(in->dptr()), reinterpret_cast(mask->mut_dptr()), - reinterpret_cast(addend->dptr()), reinterpret_cast(out->mut_dptr())); - } else { - DispatchTail(stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, - reinterpret_cast(in->dptr()), - 
reinterpret_cast(mask->mut_dptr()), nullptr, - reinterpret_cast(out->mut_dptr())); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DROPOUT_KERNEL_GPU(cpp_type, data_type) \ - REGISTER_USER_KERNEL("dropout").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == data_type) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)) - -REGISTER_DROPOUT_KERNEL_GPU(half, DataType::kFloat16); -REGISTER_DROPOUT_KERNEL_GPU(float, DataType::kFloat); -REGISTER_DROPOUT_KERNEL_GPU(double, DataType::kDouble); - - -template -class DropoutGradKernelGPU final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - DropoutGradKernelGPU() = default; - ~DropoutGradKernelGPU() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float scale = ctx->Attr("scale"); - const int64_t elem_cnt = dy->shape_view().elem_cnt(); - OF_CUDA_CHECK((cuda::elementwise::Binary( - MaskAndScaleFunctor(scale), elem_cnt, reinterpret_cast(dx->mut_dptr()), - reinterpret_cast(dy->dptr()), reinterpret_cast(mask->dptr()), - ctx->stream()->As()->cuda_stream()))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ - REGISTER_USER_KERNEL("dropout_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == data_type)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "dy", 0, true)); \ - return Maybe::Ok(); \ - }) - -REGISTER_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16); -REGISTER_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat); -REGISTER_DROPOUT_GRAD_KERNEL_GPU(double, DataType::kDouble); - -} // namespace - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/op_kernel_wrapper.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/dropout_kernel.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +// #include "oneflow/core/device/cuda_pseudo_bfloat16.h" +namespace oneflow { + +namespace { + +constexpr int32_t kVecSize = 4; +constexpr int32_t kBlockSize = 256; + +template +constexpr int32_t GetDropoutPackSize() { + // For float, bfloat16, half. 
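+  // One hiprand_uniform4() call yields four uniform floats, so a pack of four lets each
+  // vectorized loop iteration consume exactly one batch of generated random numbers.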
+ return 4; +}; + +template<> +constexpr int32_t GetDropoutPackSize() { + return 2; +}; + +template<> +constexpr int32_t GetDropoutPackSize() { + return 2; +}; + +union RandPack4 { + float4 storage; + float elem[4]; +}; + +template +struct GetPack2Type { + using T2 = typename std::aligned_storage<2 * sizeof(T), 2 * sizeof(T)>::type; +}; + +template<> +struct GetPack2Type { + using T2 = half2; +}; + + +template +using Pack2Type = typename GetPack2Type::T2; + +using H2PackType = typename std::aligned_storage<4 * sizeof(half), 4 * sizeof(half)>::type; + +template +union H2Pack { + cuda::elementwise::Pack pack_storage; + Pack2Type h2[2]; + __device__ H2Pack() { + // do nothing + } +}; + +template<> +union H2Pack { + cuda::elementwise::Pack pack_storage; + half2 h2[2]; + __device__ H2Pack() { + // do nothing + } +}; + +template +__device__ Pack2Type Make2(float v); + +template<> +__device__ Pack2Type Make2(float v) { + return __float2half2_rn(v); +} + +#define RETURN_VOID_IF_HALF typename std::enable_if_t::value, void> + +#define RETURN_VOID_IF_FLOAT typename std::enable_if_t::value, void> +#define RETURN_VOID_IF_DOUBLE typename std::enable_if_t::value, void> + +template +__global__ RETURN_VOID_IF_FLOAT FusedDropoutAddGpu( + uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, + const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, + const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using MaskType = cuda::elementwise::PackType; + using MaskPack = cuda::elementwise::Pack; + + T t_scale = static_cast(scale); + RandPack4 rand_uniform_pack4; + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + rand_uniform_pack4.storage = hiprand_uniform4(&state); + + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + LoadPack addend_vec; + if (has_addend) { + const LoadType* addend_load = reinterpret_cast(addend + linear_index); + addend_vec.storage = *addend_load; + } + + MaskPack mask_vec; + LoadPack y_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; + T tmp_float_mask = static_cast(mask_vec.elem[i]); + y_vec.elem[i] = x_vec.elem[i] * tmp_float_mask * t_scale; + if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } + } + + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const float rand_uniform = hiprand_uniform(&state); + const bool mask_val = rand_uniform > rate; + tail_mask[global_thread_id] = mask_val; + T tmp_float_mask = static_cast(mask_val); + T tmp_tail_out = tail_x[global_thread_id] * tmp_float_mask * t_scale; + if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } + tail_y[global_thread_id] = tmp_tail_out; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain 
the state of generator's dev_offset + } + } +} + +template +__global__ RETURN_VOID_IF_HALF FusedDropoutAddGpu( + uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, + const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, + const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StoreType = cuda::elementwise::PackType, pack_size / 2>; + using StorePack = cuda::elementwise::Pack, pack_size / 2>; + using MaskType = cuda::elementwise::PackType; + using MaskPack = cuda::elementwise::Pack; + + RandPack4 rand_uniform_pack4; + Pack2Type h2_scale = Make2(scale); + + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + rand_uniform_pack4.storage = hiprand_uniform4(&state); + const LoadType* x_load = reinterpret_cast(x + linear_index); + H2Pack x_vec{}; + x_vec.pack_storage.storage = *x_load; + + H2Pack addend_vec{}; + if (has_addend) { + const LoadType* addend_load = reinterpret_cast(addend + linear_index); + addend_vec.pack_storage.storage = *addend_load; + } + + MaskPack mask_vec; + StorePack y_vec; + StorePack one_or_zero_h2; + + mask_vec.elem[0] = rand_uniform_pack4.elem[0] > rate; + float tmp_float_mask = static_cast(mask_vec.elem[0]); + one_or_zero_h2.elem[0].x = tmp_float_mask; + mask_vec.elem[1] = rand_uniform_pack4.elem[1] > rate; + tmp_float_mask = static_cast(mask_vec.elem[1]); + one_or_zero_h2.elem[0].y = tmp_float_mask; + y_vec.elem[0] = __hmul2(__hmul2(x_vec.h2[0], one_or_zero_h2.elem[0]), h2_scale); + + mask_vec.elem[2] = rand_uniform_pack4.elem[2] > rate; + tmp_float_mask = static_cast(mask_vec.elem[2]); + one_or_zero_h2.elem[1].x = tmp_float_mask; + mask_vec.elem[3] = rand_uniform_pack4.elem[3] > rate; + tmp_float_mask = static_cast(mask_vec.elem[3]); + one_or_zero_h2.elem[1].y = tmp_float_mask; + y_vec.elem[1] = __hmul2(__hmul2(x_vec.h2[1], one_or_zero_h2.elem[1]), h2_scale); + + if (has_addend) { + y_vec.elem[0] = __hadd2(y_vec.elem[0], addend_vec.h2[0]); + y_vec.elem[1] = __hadd2(y_vec.elem[1], addend_vec.h2[1]); + } + + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const float rand_uniform = hiprand_uniform(&state); + const bool mask_val = rand_uniform > rate; + tail_mask[global_thread_id] = mask_val; + float tmp_half_mask = static_cast(mask_val); + T tmp_tail_out = tail_x[global_thread_id] * static_cast(tmp_half_mask) * static_cast(h2_scale.data.x); + if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } + tail_y[global_thread_id] = tmp_tail_out; + } + + __syncthreads(); + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +template +__global__ RETURN_VOID_IF_DOUBLE FusedDropoutAddGpu( + uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, + const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, 
bool* mask, + const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using MaskType = cuda::elementwise::PackType; + using MaskPack = cuda::elementwise::Pack; + + RandPack4 rand_uniform_pack4; + bool grid_loop_rand_state = 0; + + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + if (grid_loop_rand_state == 0) { + rand_uniform_pack4.storage = hiprand_uniform4(&state); + grid_loop_rand_state ^= 1; + } else { + // Use the last two random numbers we generated in previous iteration. + rand_uniform_pack4.elem[0] = rand_uniform_pack4.elem[2]; + rand_uniform_pack4.elem[1] = rand_uniform_pack4.elem[3]; + grid_loop_rand_state ^= 1; + } + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + LoadPack addend_vec; + if (has_addend) { + const LoadType* addend_load = reinterpret_cast(addend + linear_index); + addend_vec.storage = *addend_load; + } + + MaskPack mask_vec; + LoadPack y_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; + y_vec.elem[i] = x_vec.elem[i] * mask_vec.elem[i] * scale; + if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } + } + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const float rand_uniform = hiprand_uniform(&state); + const bool mask_val = rand_uniform > rate; + tail_mask[global_thread_id] = mask_val; + double tmp_tail_out = tail_x[global_thread_id] * mask_val * scale; + if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } + tail_y[global_thread_id] = tmp_tail_out; + } + + __syncthreads(); + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +unsigned int ComputeGridSize(ep::Stream* stream, const int32_t block_size, const int64_t elem_cnt) { + auto* cuda_stream = stream->As(); + const int32_t max_threads_multi_process = + cuda_stream->device_properties().maxThreadsPerMultiProcessor; + const int32_t multi_processor_count = cuda_stream->device_properties().multiProcessorCount; + unsigned int blocks_per_sm = max_threads_multi_process / block_size; + unsigned int grid_size = std::max((int64_t)1, ((elem_cnt + block_size - 1) / block_size)); + grid_size = std::min((unsigned int)multi_processor_count * blocks_per_sm, grid_size); + return grid_size; +} + +template +void DispatchTail(ep::Stream* stream, uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, + const int64_t elem_cnt, float rate, float scale, const T* x, bool* mask, + const T* addend, T* y) { + constexpr int pack_size = GetDropoutPackSize(); + const int64_t pack_num = elem_cnt / pack_size; + unsigned int grid_size = ComputeGridSize(stream, kBlockSize, pack_num); + const int64_t tail_offset = pack_num * pack_size; + const int64_t n_tail = elem_cnt - tail_offset; + const bool tail = n_tail > 0 ? 
true : false; + uint64_t inc_offset = 0; + + if (tail) { + // If tail, we need generate randnum one more time, so here we add another `1`. + inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize + 1; + FusedDropoutAddGpu + <<As()->cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, + (x + tail_offset), (mask + tail_offset), (addend + tail_offset), (y + tail_offset)); + } else { + inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize; + FusedDropoutAddGpu + <<As()->cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, + nullptr, nullptr, nullptr, nullptr); + } +} + +template +struct MaskAndScaleFunctor { + OF_DEVICE_FUNC explicit MaskAndScaleFunctor(float scale) : scale(scale) {} + __device__ T operator()(T x, bool mask) const { + return x * static_cast(mask) * static_cast(scale); + } + float scale; +}; + + + +template +class DropoutKernelGPU final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + DropoutKernelGPU() = default; + ~DropoutKernelGPU() = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + const auto& generator = CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA)); + return std::make_shared(generator); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + auto* fused_dropout_kernel_state = dynamic_cast(state); + CHECK_NOTNULL(fused_dropout_kernel_state); + const auto& generator = fused_dropout_kernel_state->generator(); + CHECK_NOTNULL(generator); + auto* stream = ctx->stream(); + const auto device_index = stream->device()->device_index(); + std::shared_ptr cuda_generator = + CHECK_JUST(generator->Get(device_index)); + uint64_t seed = cuda_generator->current_seed(); + + const float rate = ctx->Attr("rate"); + float scale = 0.0; + if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } + one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); + + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + DispatchTail( + stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, + reinterpret_cast(in->dptr()), reinterpret_cast(mask->mut_dptr()), + reinterpret_cast(addend->dptr()), reinterpret_cast(out->mut_dptr())); + } else { + DispatchTail(stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, + reinterpret_cast(in->dptr()), + reinterpret_cast(mask->mut_dptr()), nullptr, + reinterpret_cast(out->mut_dptr())); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DROPOUT_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("dropout").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == data_type) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)) + +REGISTER_DROPOUT_KERNEL_GPU(half, DataType::kFloat16); +REGISTER_DROPOUT_KERNEL_GPU(float, DataType::kFloat); +REGISTER_DROPOUT_KERNEL_GPU(double, DataType::kDouble); + + +template +class DropoutGradKernelGPU final : public 
user_op::OpKernel, public user_op::CudaGraphSupport { + public: + DropoutGradKernelGPU() = default; + ~DropoutGradKernelGPU() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float scale = ctx->Attr("scale"); + const int64_t elem_cnt = dy->shape_view().elem_cnt(); + OF_CUDA_CHECK((cuda::elementwise::Binary( + MaskAndScaleFunctor(scale), elem_cnt, reinterpret_cast(dx->mut_dptr()), + reinterpret_cast(dy->dptr()), reinterpret_cast(mask->dptr()), + ctx->stream()->As()->cuda_stream()))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == data_type)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "dy", 0, true)); \ + return Maybe::Ok(); \ + }) + +REGISTER_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16); +REGISTER_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat); +REGISTER_DROPOUT_GRAD_KERNEL_GPU(double, DataType::kDouble); + +} // namespace + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp b/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp index d614a8e..6a43bf1 100644 --- a/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp +++ b/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp @@ -1,67 +1,67 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -__global__ void DynamicLossScaleScheduleGpu(const int64_t increment_period, const float multiplier, - const int64_t* count_not_finite, float* loss_scale, - int64_t* good_step_counter) { - if (*count_not_finite == 0) { - int64_t cur_good_step_counter = *good_step_counter + 1; - if (cur_good_step_counter >= increment_period) { - *loss_scale = static_cast( - min(static_cast(*loss_scale) * multiplier, static_cast(FLT_MAX))); - cur_good_step_counter = 0; - } - *good_step_counter = cur_good_step_counter; - } else { - *good_step_counter = 0; - *loss_scale = static_cast(max(static_cast(*loss_scale) / multiplier, 1.0)); - } -} - -} // namespace - -class DynamicLossScaleScheduleGpuKernel final : public user_op::OpKernel { - public: - DynamicLossScaleScheduleGpuKernel() = default; - ~DynamicLossScaleScheduleGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* count_not_finite = ctx->Tensor4ArgNameAndIndex("count_not_finite", 0); - user_op::Tensor* loss_scale = ctx->Tensor4ArgNameAndIndex("loss_scale", 0); - user_op::Tensor* good_step_counter = ctx->Tensor4ArgNameAndIndex("good_step_counter", 0); - const auto increment_period = ctx->Attr("increment_period"); - const auto multiplier = ctx->Attr("multiplier"); - DynamicLossScaleScheduleGpu<<<1, 1, 0, ctx->stream()->As()->cuda_stream()>>>( - increment_period, multiplier, count_not_finite->dptr(), - loss_scale->mut_dptr(), good_step_counter->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } -}; - -REGISTER_USER_KERNEL("dynamic_loss_scale_schedule") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +__global__ void DynamicLossScaleScheduleGpu(const int64_t increment_period, const float multiplier, + const int64_t* count_not_finite, float* loss_scale, + int64_t* good_step_counter) { + if (*count_not_finite == 0) { + int64_t cur_good_step_counter = *good_step_counter + 1; + if (cur_good_step_counter >= increment_period) { + *loss_scale = static_cast( + min(static_cast(*loss_scale) * multiplier, static_cast(FLT_MAX))); + cur_good_step_counter = 0; + } + *good_step_counter = cur_good_step_counter; + } else { + *good_step_counter = 0; + *loss_scale = static_cast(max(static_cast(*loss_scale) / multiplier, 1.0)); + } +} + +} // namespace + +class DynamicLossScaleScheduleGpuKernel final : public user_op::OpKernel { + public: + DynamicLossScaleScheduleGpuKernel() = default; + ~DynamicLossScaleScheduleGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* count_not_finite = ctx->Tensor4ArgNameAndIndex("count_not_finite", 0); + user_op::Tensor* loss_scale = ctx->Tensor4ArgNameAndIndex("loss_scale", 0); + user_op::Tensor* good_step_counter = ctx->Tensor4ArgNameAndIndex("good_step_counter", 0); + const auto increment_period = ctx->Attr("increment_period"); + const auto multiplier = ctx->Attr("multiplier"); + DynamicLossScaleScheduleGpu<<<1, 1, 0, ctx->stream()->As()->cuda_stream()>>>( + increment_period, multiplier, count_not_finite->dptr(), + loss_scale->mut_dptr(), good_step_counter->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } +}; + +REGISTER_USER_KERNEL("dynamic_loss_scale_schedule") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/eager_nccl_kernels.hip.cpp b/oneflow/user/kernels/eager_nccl_kernels.hip.cpp index 55ebfcf..77d2ad6 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.hip.cpp +++ b/oneflow/user/kernels/eager_nccl_kernels.hip.cpp @@ -1,404 +1,404 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/common/container_util.h" -#include "oneflow/core/control/global_process_ctx.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/nccl_util.h" -#include "oneflow/core/job/eager_nccl_comm_manager.h" -#include "oneflow/core/job/parallel_desc.h" -#include "oneflow/core/ep/include/primitive/permute.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#if defined(WITH_ROCM) - -namespace oneflow { - -namespace { - -class EagerNcclOpKernelCache final : public user_op::OpKernelCache { - public: - explicit EagerNcclOpKernelCache(user_op::KernelCacheContext* ctx) { Init(ctx); } - ~EagerNcclOpKernelCache() override = default; - - Symbol parallel_desc() const { return parallel_desc_; } - ncclComm_t comm() const { return comm_; } - - private: - void Init(user_op::KernelCacheContext* ctx) { - const std::string& parallel_conf_txt = ctx->Attr("parallel_conf"); - ParallelConf parallel_conf; - std::set> device_set; - CHECK(TxtString2PbMessage(parallel_conf_txt, ¶llel_conf)); - parallel_desc_ = SymbolOf(ParallelDesc(parallel_conf)); - FOR_RANGE(int64_t, parallel_id, 0, parallel_desc_->parallel_num()) { - int64_t machine_id = CHECK_JUST(parallel_desc_->MachineId4ParallelId(parallel_id)); - int64_t device_id = CHECK_JUST(parallel_desc_->DeviceId4ParallelId(parallel_id)); - device_set.emplace(std::make_pair(machine_id, device_id)); - } - comm_ = CHECK_NOTNULL(Singleton::Get())->GetCommForDevice(device_set); - } - - Symbol parallel_desc_; - ncclComm_t comm_{}; -}; - -size_t InferEagerNcclS2SKernelTmpBufferSize(user_op::InferContext* ctx) { - const user_op::TensorDesc& in_tensor = ctx->InputTensorDesc("in", 0); - size_t tensor_byte_size = - GetCudaAlignedSize(in_tensor.shape().elem_cnt() * GetSizeOfDataType(in_tensor.data_type())); - // NOTE(hanbinbin): Set tmp_buffer_size to twice tensor_byte_size because the - // SbpParallel4ArgNameAndIndex function of LocalUserOpInferContext is unimplemented - return tensor_byte_size * 2; -} - -void InitEagerNcclOpKernelCache(user_op::KernelCacheContext* ctx, - std::shared_ptr* cache_ptr) { - // NOTE(jianhao): the cache only depends on parallel_conf, and the kernel is singleton - // once parallel_conf is determined, so only init the cache at the first time. 
- if (*cache_ptr == nullptr) { *cache_ptr = std::make_shared(ctx); } -} -} // namespace - -class EagerNcclAllReduceKernel final : public user_op::OpKernel { - public: - EagerNcclAllReduceKernel() = default; - ~EagerNcclAllReduceKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape_view(), out->shape_view()); - CHECK_EQ(in->data_type(), out->data_type()); - ncclRedOp_t reduce_type = ncclSum; - if (in->data_type() == kBool) { reduce_type = ncclMax; } - OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), - GetNcclDataType(in->data_type()), reduce_type, kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_all_reduce") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclBroadcastKernel final : public user_op::OpKernel { - public: - EagerNcclBroadcastKernel() = default; - ~EagerNcclBroadcastKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t root = ctx->Attr("root"); - int64_t dev_id = GlobalProcessCtx::LocalRank(root); - int64_t nccl_root = - CHECK_JUST(kernel_cache->parallel_desc()->ParallelId4MachineDeviceId(root, dev_id)); - const void* in_ptr = nullptr; - if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape_view(), out->shape_view()); - CHECK_EQ(in->data_type(), out->data_type()); - in_ptr = in->dptr(); - } - OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape_view().elem_cnt(), - GetNcclDataType(out->data_type()), nccl_root, kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_broadcast") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclTouchKernel final : public user_op::OpKernel { - public: - EagerNcclTouchKernel() = default; - ~EagerNcclTouchKernel() override = default; - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override{ - // Do nothing. 
- }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } -}; - -REGISTER_USER_KERNEL("eager_nccl_touch") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclReduceKernel final : public user_op::OpKernel { - public: - EagerNcclReduceKernel() = default; - ~EagerNcclReduceKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t root = ctx->Attr("root"); - void* out_ptr = nullptr; - if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape_view(), out->shape_view()); - CHECK_EQ(in->data_type(), out->data_type()); - out_ptr = out->mut_dptr(); - } - ncclRedOp_t reduce_type = ncclSum; - if (in->data_type() == kBool) { reduce_type = ncclMax; } - OF_NCCL_CHECK(ncclReduce(in->dptr(), out_ptr, in->shape_view().elem_cnt(), - GetNcclDataType(in->data_type()), reduce_type, root, - kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_reduce") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclReduceScatterKernel final : public user_op::OpKernel { - public: - EagerNcclReduceScatterKernel() = default; - ~EagerNcclReduceScatterKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->data_type(), out->data_type()); - ncclRedOp_t reduce_type = ncclSum; - if (in->data_type() == kBool) { - reduce_type = ncclMax; - } else { - const auto& op_type = ctx->Attr("op_type"); - reduce_type = CHECK_JUST(MapAt(op_type2ncclRedOp_t, op_type)); - } - OF_NCCL_CHECK(ncclReduceScatter( - in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), - reduce_type, kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - static HashMap op_type2ncclRedOp_t; -}; - -HashMap EagerNcclReduceScatterKernel::op_type2ncclRedOp_t = { - {"sum", ncclSum}, {"max", ncclMax}}; - -REGISTER_USER_KERNEL("eager_nccl_reduce_scatter") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclAllGatherKernel final : public user_op::OpKernel { - public: - EagerNcclAllGatherKernel() = default; - ~EagerNcclAllGatherKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); 
- } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->data_type(), out->data_type()); - OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), - GetNcclDataType(in->data_type()), kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_all_gather") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -template -class EagerNcclS2SKernel final : public user_op::OpKernel { - public: - EagerNcclS2SKernel() = default; - ~EagerNcclS2SKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - // NOTE(hanbinbin): Compute logic copy from _nccl_logical_s2s - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - int64_t tmp_size = 0; - const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); - // NOTE(chengcheng): in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out - const char* pack_to_ptr = in->dptr(); - char* unpack_from_ptr = out->mut_dptr(); - if (tmp_buffer) { tmp_size = tmp_buffer->shape_view().elem_cnt(); } - CHECK(tmp_size == 0 || tmp_size == data_size || tmp_size == data_size * 2); - - CHECK_EQ(in->data_type(), out->data_type()); - const int64_t num_ranks = kernel_cache->parallel_desc()->parallel_num(); - CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()) - << in->shape_view().ToString() << " vs " << out->shape_view().ToString(); - const int64_t elem_cnt = in->shape_view().elem_cnt(); - const int64_t in_split_axis = ctx->Attr("in_split_axis"); - const int64_t out_split_axis = ctx->Attr("out_split_axis"); - - DimVector logical_shape_dim_vec; - in->shape_view().ToDimVector(&logical_shape_dim_vec); - logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; - - if (out_split_axis != 0) { - // NOTE(chengcheng): Do pack. 
Need transpose in -> pack_to - // pack use temp buffer offset: [0, data_size] - pack_to_ptr = tmp_buffer->dptr(); - DimVector transpose_in_dim_vec = logical_shape_dim_vec; - CHECK_EQ(transpose_in_dim_vec.at(in_split_axis) % num_ranks, 0); - transpose_in_dim_vec[in_split_axis] = transpose_in_dim_vec.at(in_split_axis) / num_ranks; - CHECK_EQ(transpose_in_dim_vec.at(out_split_axis) % num_ranks, 0); - transpose_in_dim_vec[out_split_axis] = transpose_in_dim_vec.at(out_split_axis) / num_ranks; - transpose_in_dim_vec.insert(transpose_in_dim_vec.begin() + out_split_axis, num_ranks); - std::vector perm; - perm.emplace_back(out_split_axis); - FOR_RANGE(int64_t, i, 0, transpose_in_dim_vec.size()) { - if (i != out_split_axis) { perm.emplace_back(i); } - } - auto transpose = ep::primitive::NewPrimitive( - ctx->stream()->device_type(), transpose_in_dim_vec.size()); - CHECK(transpose); - transpose->Launch(ctx->stream(), in->data_type(), transpose_in_dim_vec.size(), - transpose_in_dim_vec.data(), in->dptr(), perm.data(), - tmp_buffer->mut_dptr()); - } - - if (in_split_axis != 0) { - // NOTE(chengcheng): Do unpack. Need transpose unpack_from -> out - // unpack use temp buffer offset: [tmp_size - data_size, tmp_size] - unpack_from_ptr = tmp_buffer->mut_dptr() + (tmp_size - data_size); - } - - { - // NOTE: Do S2S - OF_NCCL_CHECK(ncclGroupStart()); - const int64_t elem_per_chunk = elem_cnt / num_ranks; - const int64_t chunk_size = elem_per_chunk * dtype_size; - for (int64_t j = 0; j < num_ranks; ++j) { - OF_NCCL_CHECK(ncclSend(reinterpret_cast( - reinterpret_cast(pack_to_ptr) + j * chunk_size), - elem_per_chunk, GetNcclDataType(in->data_type()), j, - kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - OF_NCCL_CHECK(ncclRecv( - reinterpret_cast(reinterpret_cast(unpack_from_ptr) + j * chunk_size), - elem_per_chunk, GetNcclDataType(in->data_type()), j, kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - } - OF_NCCL_CHECK(ncclGroupEnd()); - } - - if (in_split_axis != 0) { - // Do unpack. 
- CHECK(unpack_from_ptr != out->mut_dptr()); - DimVector unpack_from_dim_vec = logical_shape_dim_vec; - CHECK_EQ(unpack_from_dim_vec.at(in_split_axis) % num_ranks, 0); - unpack_from_dim_vec[in_split_axis] = unpack_from_dim_vec.at(in_split_axis) / num_ranks; - CHECK_EQ(unpack_from_dim_vec.at(out_split_axis) % num_ranks, 0); - unpack_from_dim_vec[out_split_axis] = unpack_from_dim_vec.at(out_split_axis) / num_ranks; - unpack_from_dim_vec.insert(unpack_from_dim_vec.begin(), num_ranks); - std::vector perm; - FOR_RANGE(int64_t, i, 1, unpack_from_dim_vec.size()) { perm.emplace_back(i); } - perm.insert(perm.begin() + in_split_axis, 0); - auto transpose = ep::primitive::NewPrimitive( - ctx->stream()->device_type(), unpack_from_dim_vec.size()); - CHECK(transpose); - transpose->Launch(ctx->stream(), in->data_type(), unpack_from_dim_vec.size(), - unpack_from_dim_vec.data(), unpack_from_ptr, perm.data(), out->mut_dptr()); - } - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_EAGER_NCCL_S2S_KERNEL(dtype) \ - REGISTER_USER_KERNEL("eager_nccl_s2s") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferEagerNcclS2SKernelTmpBufferSize); - -REGISTER_EAGER_NCCL_S2S_KERNEL(int8_t) -REGISTER_EAGER_NCCL_S2S_KERNEL(int32_t) -REGISTER_EAGER_NCCL_S2S_KERNEL(int64_t) -REGISTER_EAGER_NCCL_S2S_KERNEL(bool) -REGISTER_EAGER_NCCL_S2S_KERNEL(float) -REGISTER_EAGER_NCCL_S2S_KERNEL(double) -REGISTER_EAGER_NCCL_S2S_KERNEL(float16) -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/common/container_util.h" +#include "oneflow/core/control/global_process_ctx.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/nccl_util.h" +#include "oneflow/core/job/eager_nccl_comm_manager.h" +#include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/ep/include/primitive/permute.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#if defined(WITH_ROCM) + +namespace oneflow { + +namespace { + +class EagerNcclOpKernelCache final : public user_op::OpKernelCache { + public: + explicit EagerNcclOpKernelCache(user_op::KernelCacheContext* ctx) { Init(ctx); } + ~EagerNcclOpKernelCache() override = default; + + Symbol parallel_desc() const { return parallel_desc_; } + ncclComm_t comm() const { return comm_; } + + private: + void Init(user_op::KernelCacheContext* ctx) { + const std::string& parallel_conf_txt = ctx->Attr("parallel_conf"); + ParallelConf parallel_conf; + std::set> device_set; + CHECK(TxtString2PbMessage(parallel_conf_txt, ¶llel_conf)); + parallel_desc_ = SymbolOf(ParallelDesc(parallel_conf)); + FOR_RANGE(int64_t, parallel_id, 0, parallel_desc_->parallel_num()) { + int64_t machine_id = CHECK_JUST(parallel_desc_->MachineId4ParallelId(parallel_id)); + int64_t device_id = CHECK_JUST(parallel_desc_->DeviceId4ParallelId(parallel_id)); + device_set.emplace(std::make_pair(machine_id, device_id)); + } + comm_ = CHECK_NOTNULL(Singleton::Get())->GetCommForDevice(device_set); + } + + Symbol parallel_desc_; + ncclComm_t comm_{}; +}; + +size_t InferEagerNcclS2SKernelTmpBufferSize(user_op::InferContext* ctx) { + const user_op::TensorDesc& in_tensor = ctx->InputTensorDesc("in", 0); + size_t tensor_byte_size = + GetCudaAlignedSize(in_tensor.shape().elem_cnt() * GetSizeOfDataType(in_tensor.data_type())); + // NOTE(hanbinbin): Set tmp_buffer_size to twice tensor_byte_size because the + // SbpParallel4ArgNameAndIndex function of LocalUserOpInferContext is unimplemented + return tensor_byte_size * 2; +} + +void InitEagerNcclOpKernelCache(user_op::KernelCacheContext* ctx, + std::shared_ptr* cache_ptr) { + // NOTE(jianhao): the cache only depends on parallel_conf, and the kernel is singleton + // once parallel_conf is determined, so only init the cache at the first time. 
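// A note on how this helper is used (descriptive comment, based on the kernels defined
// below in this file): every collective kernel here except EagerNcclTouchKernel overrides
// InitOpKernelCacheWithFlags() and forwards to InitEagerNcclOpKernelCache(), so the
// ParallelDesc parsing and the ncclComm_t lookup run once per kernel instance and the
// resulting EagerNcclOpKernelCache is reused by every subsequent Compute() call.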
+ if (*cache_ptr == nullptr) { *cache_ptr = std::make_shared(ctx); } +} +} // namespace + +class EagerNcclAllReduceKernel final : public user_op::OpKernel { + public: + EagerNcclAllReduceKernel() = default; + ~EagerNcclAllReduceKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + CHECK_EQ(in->shape_view(), out->shape_view()); + CHECK_EQ(in->data_type(), out->data_type()); + ncclRedOp_t reduce_type = ncclSum; + if (in->data_type() == kBool) { reduce_type = ncclMax; } + OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), + GetNcclDataType(in->data_type()), reduce_type, kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_all_reduce") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclBroadcastKernel final : public user_op::OpKernel { + public: + EagerNcclBroadcastKernel() = default; + ~EagerNcclBroadcastKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + int64_t root = ctx->Attr("root"); + int64_t dev_id = GlobalProcessCtx::LocalRank(root); + int64_t nccl_root = + CHECK_JUST(kernel_cache->parallel_desc()->ParallelId4MachineDeviceId(root, dev_id)); + const void* in_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape_view(), out->shape_view()); + CHECK_EQ(in->data_type(), out->data_type()); + in_ptr = in->dptr(); + } + OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape_view().elem_cnt(), + GetNcclDataType(out->data_type()), nccl_root, kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_broadcast") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclTouchKernel final : public user_op::OpKernel { + public: + EagerNcclTouchKernel() = default; + ~EagerNcclTouchKernel() override = default; + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override{ + // Do nothing. 
+ }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } +}; + +REGISTER_USER_KERNEL("eager_nccl_touch") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclReduceKernel final : public user_op::OpKernel { + public: + EagerNcclReduceKernel() = default; + ~EagerNcclReduceKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + int64_t root = ctx->Attr("root"); + void* out_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape_view(), out->shape_view()); + CHECK_EQ(in->data_type(), out->data_type()); + out_ptr = out->mut_dptr(); + } + ncclRedOp_t reduce_type = ncclSum; + if (in->data_type() == kBool) { reduce_type = ncclMax; } + OF_NCCL_CHECK(ncclReduce(in->dptr(), out_ptr, in->shape_view().elem_cnt(), + GetNcclDataType(in->data_type()), reduce_type, root, + kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_reduce") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclReduceScatterKernel final : public user_op::OpKernel { + public: + EagerNcclReduceScatterKernel() = default; + ~EagerNcclReduceScatterKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + CHECK_EQ(in->data_type(), out->data_type()); + ncclRedOp_t reduce_type = ncclSum; + if (in->data_type() == kBool) { + reduce_type = ncclMax; + } else { + const auto& op_type = ctx->Attr("op_type"); + reduce_type = CHECK_JUST(MapAt(op_type2ncclRedOp_t, op_type)); + } + OF_NCCL_CHECK(ncclReduceScatter( + in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), + reduce_type, kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + static HashMap op_type2ncclRedOp_t; +}; + +HashMap EagerNcclReduceScatterKernel::op_type2ncclRedOp_t = { + {"sum", ncclSum}, {"max", ncclMax}}; + +REGISTER_USER_KERNEL("eager_nccl_reduce_scatter") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclAllGatherKernel final : public user_op::OpKernel { + public: + EagerNcclAllGatherKernel() = default; + ~EagerNcclAllGatherKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); 
+ } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + CHECK_EQ(in->data_type(), out->data_type()); + OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), + GetNcclDataType(in->data_type()), kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_all_gather") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +template +class EagerNcclS2SKernel final : public user_op::OpKernel { + public: + EagerNcclS2SKernel() = default; + ~EagerNcclS2SKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + // NOTE(hanbinbin): Compute logic copy from _nccl_logical_s2s + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + int64_t tmp_size = 0; + const int64_t dtype_size = GetSizeOfDataType(in->data_type()); + int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); + // NOTE(chengcheng): in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out + const char* pack_to_ptr = in->dptr(); + char* unpack_from_ptr = out->mut_dptr(); + if (tmp_buffer) { tmp_size = tmp_buffer->shape_view().elem_cnt(); } + CHECK(tmp_size == 0 || tmp_size == data_size || tmp_size == data_size * 2); + + CHECK_EQ(in->data_type(), out->data_type()); + const int64_t num_ranks = kernel_cache->parallel_desc()->parallel_num(); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()) + << in->shape_view().ToString() << " vs " << out->shape_view().ToString(); + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t in_split_axis = ctx->Attr("in_split_axis"); + const int64_t out_split_axis = ctx->Attr("out_split_axis"); + + DimVector logical_shape_dim_vec; + in->shape_view().ToDimVector(&logical_shape_dim_vec); + logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; + + if (out_split_axis != 0) { + // NOTE(chengcheng): Do pack. 
Need transpose in -> pack_to + // pack use temp buffer offset: [0, data_size] + pack_to_ptr = tmp_buffer->dptr(); + DimVector transpose_in_dim_vec = logical_shape_dim_vec; + CHECK_EQ(transpose_in_dim_vec.at(in_split_axis) % num_ranks, 0); + transpose_in_dim_vec[in_split_axis] = transpose_in_dim_vec.at(in_split_axis) / num_ranks; + CHECK_EQ(transpose_in_dim_vec.at(out_split_axis) % num_ranks, 0); + transpose_in_dim_vec[out_split_axis] = transpose_in_dim_vec.at(out_split_axis) / num_ranks; + transpose_in_dim_vec.insert(transpose_in_dim_vec.begin() + out_split_axis, num_ranks); + std::vector perm; + perm.emplace_back(out_split_axis); + FOR_RANGE(int64_t, i, 0, transpose_in_dim_vec.size()) { + if (i != out_split_axis) { perm.emplace_back(i); } + } + auto transpose = ep::primitive::NewPrimitive( + ctx->stream()->device_type(), transpose_in_dim_vec.size()); + CHECK(transpose); + transpose->Launch(ctx->stream(), in->data_type(), transpose_in_dim_vec.size(), + transpose_in_dim_vec.data(), in->dptr(), perm.data(), + tmp_buffer->mut_dptr()); + } + + if (in_split_axis != 0) { + // NOTE(chengcheng): Do unpack. Need transpose unpack_from -> out + // unpack use temp buffer offset: [tmp_size - data_size, tmp_size] + unpack_from_ptr = tmp_buffer->mut_dptr() + (tmp_size - data_size); + } + + { + // NOTE: Do S2S + OF_NCCL_CHECK(ncclGroupStart()); + const int64_t elem_per_chunk = elem_cnt / num_ranks; + const int64_t chunk_size = elem_per_chunk * dtype_size; + for (int64_t j = 0; j < num_ranks; ++j) { + OF_NCCL_CHECK(ncclSend(reinterpret_cast( + reinterpret_cast(pack_to_ptr) + j * chunk_size), + elem_per_chunk, GetNcclDataType(in->data_type()), j, + kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + OF_NCCL_CHECK(ncclRecv( + reinterpret_cast(reinterpret_cast(unpack_from_ptr) + j * chunk_size), + elem_per_chunk, GetNcclDataType(in->data_type()), j, kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + } + OF_NCCL_CHECK(ncclGroupEnd()); + } + + if (in_split_axis != 0) { + // Do unpack. 
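// Shape-bookkeeping sketch for the unpack transpose below, assuming (for illustration only)
// num_ranks = 2, logical shape [4, 6], in_split_axis = 1, out_split_axis = 0:
//   * each rank holds a local [4, 3] input (columns split); pack is skipped because
//     out_split_axis == 0, so the all-to-all exchanges contiguous chunks of
//     elem_per_chunk = 12 / 2 = 6 elements (two rows of the local [4, 3] buffer);
//   * after the exchange, unpack_from_ptr holds a [2, 2, 3] buffer whose leading
//     dimension is the source rank;
//   * unpack_from_dim_vec becomes [2, 2, 3] and perm becomes [1, 0, 2], so the
//     transpose interleaves the per-rank column blocks row by row, producing the
//     final local [2, 6] output (rows split across ranks).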
+ CHECK(unpack_from_ptr != out->mut_dptr()); + DimVector unpack_from_dim_vec = logical_shape_dim_vec; + CHECK_EQ(unpack_from_dim_vec.at(in_split_axis) % num_ranks, 0); + unpack_from_dim_vec[in_split_axis] = unpack_from_dim_vec.at(in_split_axis) / num_ranks; + CHECK_EQ(unpack_from_dim_vec.at(out_split_axis) % num_ranks, 0); + unpack_from_dim_vec[out_split_axis] = unpack_from_dim_vec.at(out_split_axis) / num_ranks; + unpack_from_dim_vec.insert(unpack_from_dim_vec.begin(), num_ranks); + std::vector perm; + FOR_RANGE(int64_t, i, 1, unpack_from_dim_vec.size()) { perm.emplace_back(i); } + perm.insert(perm.begin() + in_split_axis, 0); + auto transpose = ep::primitive::NewPrimitive( + ctx->stream()->device_type(), unpack_from_dim_vec.size()); + CHECK(transpose); + transpose->Launch(ctx->stream(), in->data_type(), unpack_from_dim_vec.size(), + unpack_from_dim_vec.data(), unpack_from_ptr, perm.data(), out->mut_dptr()); + } + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_EAGER_NCCL_S2S_KERNEL(dtype) \ + REGISTER_USER_KERNEL("eager_nccl_s2s") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferEagerNcclS2SKernelTmpBufferSize); + +REGISTER_EAGER_NCCL_S2S_KERNEL(int8_t) +REGISTER_EAGER_NCCL_S2S_KERNEL(int32_t) +REGISTER_EAGER_NCCL_S2S_KERNEL(int64_t) +REGISTER_EAGER_NCCL_S2S_KERNEL(bool) +REGISTER_EAGER_NCCL_S2S_KERNEL(float) +REGISTER_EAGER_NCCL_S2S_KERNEL(double) +REGISTER_EAGER_NCCL_S2S_KERNEL(float16) +} // namespace oneflow + #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp b/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp index 9e9e330..e139f4f 100644 --- a/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp +++ b/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp @@ -1,57 +1,57 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/elementwise_maximum_minimum_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { -template class Opt, typename T> -__global__ void ElementwiseXimumGradGpuKernel(int64_t elem_cnt, const T* dz, const T* x, const T* y, - T* dx, T* dy) { - XPU_1D_KERNEL_LOOP(idx, elem_cnt) { - Opt()(dz[idx], x[idx], y[idx], dx ? &dx[idx] : nullptr, dy ? 
&dy[idx] : nullptr); - } -} - -template class Opt, typename T> -struct ElemwiseXimumGradFunctor final { - void operator()(ep::Stream* stream, int64_t elem_cnt, const T* dz, const T* x, const T* y, T* dx, - T* dy) { - ElementwiseXimumGradGpuKernel - <<As()->cuda_stream()>>>(elem_cnt, dz, x, y, dx, dy); - } -}; - -template class Opt, typename T> -struct ElemwiseXimumFunctor final { - void operator()(ep::Stream* stream, int64_t elem_cnt, T* z, const T* x, const T* y) { - OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), elem_cnt, z, x, y, - stream->As()->cuda_stream())); - } -}; -} // namespace - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MAXIMUM_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MINIMUM_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/elementwise_maximum_minimum_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { +template class Opt, typename T> +__global__ void ElementwiseXimumGradGpuKernel(int64_t elem_cnt, const T* dz, const T* x, const T* y, + T* dx, T* dy) { + XPU_1D_KERNEL_LOOP(idx, elem_cnt) { + Opt()(dz[idx], x[idx], y[idx], dx ? &dx[idx] : nullptr, dy ? &dy[idx] : nullptr); + } +} + +template class Opt, typename T> +struct ElemwiseXimumGradFunctor final { + void operator()(ep::Stream* stream, int64_t elem_cnt, const T* dz, const T* x, const T* y, T* dx, + T* dy) { + ElementwiseXimumGradGpuKernel + <<As()->cuda_stream()>>>(elem_cnt, dz, x, y, dx, dy); + } +}; + +template class Opt, typename T> +struct ElemwiseXimumFunctor final { + void operator()(ep::Stream* stream, int64_t elem_cnt, T* z, const T* x, const T* y) { + OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), elem_cnt, z, x, y, + stream->As()->cuda_stream())); + } +}; +} // namespace + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MAXIMUM_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MINIMUM_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) +} // namespace oneflow #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/embedding_kernel.hip.cpp b/oneflow/user/kernels/embedding_kernel.hip.cpp index 5c9c6f7..2dc5e55 100644 --- a/oneflow/user/kernels/embedding_kernel.hip.cpp +++ b/oneflow/user/kernels/embedding_kernel.hip.cpp @@ -1,160 +1,160 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/ep/include/primitive/memset.h" -#include "oneflow/user/kernels/embedding_kernel_util.h" - -namespace oneflow { - -template -class GpuEmbeddingRenormKernel final : public user_op::OpKernel { - public: - GpuEmbeddingRenormKernel() = default; - ~GpuEmbeddingRenormKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const double max_norm = ctx->Attr("max_norm"); - const double norm_type = ctx->Attr("norm_type"); - - const ShapeView& in_shape = in->shape_view(); - const int64_t emb_size = in_shape.At(0); - const int64_t emb_dim = in_shape.At(1); - const T* in_buf = in->dptr(); - const IndexType* indices_buf = indices->dptr(); - T* out_buf = out->mut_dptr(); - const int64_t num_indices = indices->shape_view().elem_cnt(); - int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr(); - std::unique_ptr memset_primitive = - ep::primitive::NewPrimitive(ctx->device_type()); - CHECK(memset_primitive); - memset_primitive->Launch(ctx->stream(), tmp_buf, 0, - GetCudaAlignedSize(sizeof(int32_t) * emb_size)); - EmbeddingReNormFunctor()( - ctx->stream(), in_buf, indices_buf, out_buf, max_norm, norm_type, num_indices, emb_size, - emb_dim, tmp_buf); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuEmbeddingKernel final : public user_op::OpKernel { - public: - GpuEmbeddingKernel() = default; - ~GpuEmbeddingKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t padding_idx = ctx->Attr("padding_idx"); - const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - - const int64_t num_indices = indices->shape_view().elem_cnt(); - const int64_t emb_size = weight->shape_view().At(0); - const int64_t emb_dim = weight->shape_view().At(1); - const T* weight_buf = weight->dptr(); - const IndexType* indices_buf = indices->dptr(); - T* out_buf = out->mut_dptr(); - - EmbeddingFunctor()(ctx->stream(), weight_buf, indices_buf, - out_buf, padding_idx, scale_grad_by_freq, - num_indices, emb_size, emb_dim); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuEmbeddingGradKernel final : public user_op::OpKernel { - public: - GpuEmbeddingGradKernel() = default; - ~GpuEmbeddingGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - 
const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t padding_idx = ctx->Attr("padding_idx"); - const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - - const int64_t num_indices = indices->shape_view().elem_cnt(); - const int64_t emb_size = weight->shape_view().At(0); - const int64_t emb_dim = weight->shape_view().At(1); - - const T* dy_buf = dy->dptr(); - const IndexType* indices_buf = indices->dptr(); - T* dx_buf = dx->mut_dptr(); - int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr(); - std::unique_ptr memset_primitive = - ep::primitive::NewPrimitive(ctx->device_type()); - CHECK(memset_primitive); - memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape_view().elem_cnt() * sizeof(T)); - memset_primitive->Launch(ctx->stream(), tmp_buf, 0, - GetCudaAlignedSize(sizeof(int32_t) * emb_size)); - EmbeddingGradFunctor()( - ctx->stream(), dy_buf, indices_buf, dx_buf, padding_idx, scale_grad_by_freq, num_indices, - emb_size, emb_dim, tmp_buf); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_KERNEL(in_type, indices_type) \ - REGISTER_USER_KERNEL("embedding_renorm") \ - .SetCreateFn< \ - GpuEmbeddingRenormKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int64_t emb_size = in_shape.At(0); \ - return GetCudaAlignedSize(sizeof(int32_t) * emb_size); \ - }); \ - REGISTER_USER_KERNEL("embedding") \ - .SetCreateFn< \ - GpuEmbeddingKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))); \ - REGISTER_USER_KERNEL("embedding_grad") \ - .SetCreateFn< \ - GpuEmbeddingGradKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("weight", 0); \ - const int64_t emb_size = in_shape.At(0); \ - return GetCudaAlignedSize(sizeof(int32_t) * emb_size); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_KERNEL, EMBEDDING_DATA_TYPE_SEQ_CUDA, - INDEX_DATA_TYPE_SEQ) -#undef REGISTER_CUDA_EMBEDDING_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#include "oneflow/core/ep/include/primitive/memset.h"
+#include "oneflow/user/kernels/embedding_kernel_util.h"
+
+namespace oneflow {
+
+template<typename T, typename IndexType>
+class GpuEmbeddingRenormKernel final : public user_op::OpKernel {
+ public:
+  GpuEmbeddingRenormKernel() = default;
+  ~GpuEmbeddingRenormKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
+    const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    const double max_norm = ctx->Attr<double>("max_norm");
+    const double norm_type = ctx->Attr<double>("norm_type");
+
+    const ShapeView& in_shape = in->shape_view();
+    const int64_t emb_size = in_shape.At(0);
+    const int64_t emb_dim = in_shape.At(1);
+    const T* in_buf = in->dptr<T>();
+    const IndexType* indices_buf = indices->dptr<IndexType>();
+    T* out_buf = out->mut_dptr<T>();
+    const int64_t num_indices = indices->shape_view().elem_cnt();
+    int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr<int32_t>();
+    std::unique_ptr<ep::primitive::Memset> memset_primitive =
+        ep::primitive::NewPrimitive<ep::primitive::MemsetFactory>(ctx->device_type());
+    CHECK(memset_primitive);
+    memset_primitive->Launch(ctx->stream(), tmp_buf, 0,
+                             GetCudaAlignedSize(sizeof(int32_t) * emb_size));
+    EmbeddingReNormFunctor<DeviceType::kCUDA, T, IndexType>()(
+        ctx->stream(), in_buf, indices_buf, out_buf, max_norm, norm_type, num_indices, emb_size,
+        emb_dim, tmp_buf);
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+template<typename T, typename IndexType>
+class GpuEmbeddingKernel final : public user_op::OpKernel {
+ public:
+  GpuEmbeddingKernel() = default;
+  ~GpuEmbeddingKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0);
+    const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    const int64_t padding_idx = ctx->Attr<int64_t>("padding_idx");
+    const bool scale_grad_by_freq = ctx->Attr<bool>("scale_grad_by_freq");
+
+    const int64_t num_indices = indices->shape_view().elem_cnt();
+    const int64_t emb_size = weight->shape_view().At(0);
+    const int64_t emb_dim = weight->shape_view().At(1);
+    const T* weight_buf = weight->dptr<T>();
+    const IndexType* indices_buf = indices->dptr<IndexType>();
+    T* out_buf = out->mut_dptr<T>();
+
+    EmbeddingFunctor<DeviceType::kCUDA, T, IndexType>()(ctx->stream(), weight_buf, indices_buf,
+                                                        out_buf, padding_idx, scale_grad_by_freq,
+                                                        num_indices, emb_size, emb_dim);
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+template<typename T, typename IndexType>
+class GpuEmbeddingGradKernel final : public user_op::OpKernel {
+ public:
+  GpuEmbeddingGradKernel() = default;
+  ~GpuEmbeddingGradKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0);
+    const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0);
+    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
+    const int64_t padding_idx = ctx->Attr<int64_t>("padding_idx");
+    const bool scale_grad_by_freq = ctx->Attr<bool>("scale_grad_by_freq");
+
+    const int64_t num_indices = indices->shape_view().elem_cnt();
+    const int64_t emb_size = weight->shape_view().At(0);
+    const int64_t emb_dim = weight->shape_view().At(1);
+
+    const T* dy_buf = dy->dptr<T>();
+    const IndexType* indices_buf = indices->dptr<IndexType>();
+    T* dx_buf = dx->mut_dptr<T>();
+    int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr<int32_t>();
+    std::unique_ptr<ep::primitive::Memset> memset_primitive =
+        ep::primitive::NewPrimitive<ep::primitive::MemsetFactory>(ctx->device_type());
+    CHECK(memset_primitive);
+    memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape_view().elem_cnt() * sizeof(T));
+    memset_primitive->Launch(ctx->stream(), tmp_buf, 0,
+                             GetCudaAlignedSize(sizeof(int32_t) * emb_size));
+    EmbeddingGradFunctor<DeviceType::kCUDA, T, IndexType>()(
+        ctx->stream(), dy_buf, indices_buf, dx_buf, padding_idx, scale_grad_by_freq, num_indices,
+        emb_size, emb_dim, tmp_buf);
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_CUDA_EMBEDDING_KERNEL(in_type, indices_type)                                      \
+  REGISTER_USER_KERNEL("embedding_renorm")                                                         \
+      .SetCreateFn<                                                                                \
+          GpuEmbeddingRenormKernel<OF_PP_PAIR_FIRST(in_type), OF_PP_PAIR_FIRST(indices_type)>>()   \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                             \
+                       && (user_op::HobDataType("in", 0) == OF_PP_PAIR_SECOND(in_type))            \
+                       && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                                \
+        const Shape& in_shape = ctx->InputShape("in", 0);                                          \
+        const int64_t emb_size = in_shape.At(0);                                                   \
+        return GetCudaAlignedSize(sizeof(int32_t) * emb_size);                                     \
+      });                                                                                          \
+  REGISTER_USER_KERNEL("embedding")                                                                \
+      .SetCreateFn<                                                                                \
+          GpuEmbeddingKernel<OF_PP_PAIR_FIRST(in_type), OF_PP_PAIR_FIRST(indices_type)>>()         \
+      .SetIsMatchedHob(                                                                            \
+          (user_op::HobDeviceType() == DeviceType::kCUDA)                                          \
+          && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type))                     \
+          && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type)));             \
+  REGISTER_USER_KERNEL("embedding_grad")                                                           \
+      .SetCreateFn<                                                                                \
+          GpuEmbeddingGradKernel<OF_PP_PAIR_FIRST(in_type), OF_PP_PAIR_FIRST(indices_type)>>()     \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                             \
+                       && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type))        \
+                       && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                                \
+        const Shape& in_shape = ctx->InputShape("weight", 0);                                      \
+        const int64_t emb_size = in_shape.At(0);                                                   \
+        return GetCudaAlignedSize(sizeof(int32_t) * emb_size);                                     \
+      });
+
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_KERNEL, EMBEDDING_DATA_TYPE_SEQ_CUDA,
+                                 INDEX_DATA_TYPE_SEQ)
+#undef REGISTER_CUDA_EMBEDDING_KERNEL
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/embedding_kernel_util.hip.cpp b/oneflow/user/kernels/embedding_kernel_util.hip.cpp
index 7c0df30..a2f5e7d 100644
--- a/oneflow/user/kernels/embedding_kernel_util.hip.cpp
+++ b/oneflow/user/kernels/embedding_kernel_util.hip.cpp
@@ -1,182 +1,182 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/embedding_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct AccumulateType { - using type = T; -}; - -template<> -struct AccumulateType { - using type = float; -}; - -template -__global__ void embedding_kernel(const T* weight_buf, const IndexType* indices_buf, T* out_buf, - const int64_t num_indices, const int64_t emb_size, - const int64_t emb_dim) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { - IndexType indices_index = i / emb_dim; - IndexType emb_dim_index = i - indices_index * emb_dim; - IndexType emb_size_index = indices_buf[indices_index]; - assert(emb_size_index >= 0 && emb_size_index < emb_size); - IndexType from_index = emb_size_index * emb_dim + emb_dim_index; - out_buf[i] = weight_buf[from_index]; - } -} - -template -__global__ void embedding_grad_kernel(const T* dy_buf, const IndexType* indices_buf, T* dx_buf, - const int64_t padding_idx, const int64_t num_indices, - const int64_t emb_dim) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { - IndexType indices_index = i / emb_dim; - IndexType emb_dim_index = i - indices_index * emb_dim; - IndexType emb_size_index = indices_buf[indices_index]; - if (emb_size_index != padding_idx) { - IndexType from_index = emb_size_index * emb_dim + emb_dim_index; - cuda::atomic::Add(dx_buf + from_index, dy_buf[i]); - } - } -} - -template -__global__ void indices_freq_kernel(const IndexType* indices_buf, const int64_t num_indices, - int32_t* indices_freq, const int64_t emb_size) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_indices) { - IndexType index = indices_buf[i]; - assert(index >= 0 && index < emb_size); - cuda::atomic::Add(indices_freq + index, 1); - } -} - -template -__global__ void emb_scale_kernel(T* dx_buf, const int64_t emb_size, const int64_t emb_dim, - int32_t* indices_freq) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, emb_size * emb_dim) { - IndexType emb_size_index = i / emb_dim; - if (indices_freq[emb_size_index] > 1) { - dx_buf[i] /= static_cast(indices_freq[emb_size_index]); - } - } -} - -template -__global__ void embedding_renorm_kernel(const T* in_buf, T* out_buf, int32_t* indices_freq, - const AccumType max_norm, const AccumType norm_type, - const int64_t emb_size, const int64_t emb_dim) { - int64_t tid = threadIdx.x; - for (int64_t emb_idx = blockIdx.x; emb_idx < emb_size; emb_idx += gridDim.x) { - if (indices_freq[emb_idx] == 0) { continue; } - int64_t base_index = emb_idx * emb_dim; - - AccumType v = 0; - for (int64_t i = tid; i < emb_dim; i += blockDim.x) { - v += pow(abs(static_cast(in_buf[base_index + i])), norm_type); - } - - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ AccumType norm; - v = BlockReduce(temp_storage).Sum(v); - - if (tid == 0) { norm = pow(v, static_cast(1.0 / norm_type)); } - __syncthreads(); - - if (norm > max_norm) { - auto scale = static_cast(max_norm / (norm + 1e-7)); - for (int64_t i = tid; i < emb_dim; i += blockDim.x) { - out_buf[base_index + i] = in_buf[base_index + i] * scale; - } - } - } -} - -} // namespace - -template -struct EmbeddingReNormFunctor final { - void operator()(ep::Stream* stream, const T* in_buf, const IndexType* indices_buf, T* out_buf, - const double max_norm, const double norm_type, const int64_t num_indices, - const int64_t emb_size, const int64_t emb_dim, int32_t* tmp_buf) { - 
indices_freq_kernel<<As()->cuda_stream()>>>( - indices_buf, num_indices, tmp_buf, emb_size); - - using AccumType = typename AccumulateType::type; - embedding_renorm_kernel - <<As()->cuda_stream()>>>( - in_buf, out_buf, tmp_buf, static_cast(max_norm), - static_cast(norm_type), emb_size, emb_dim); - } -}; - -template -struct EmbeddingFunctor final { - void operator()(ep::Stream* stream, const T* weight_buf, const IndexType* indices_buf, T* out_buf, - const int64_t padding_idx, const bool scale_grad_by_freq, - const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim) { - embedding_kernel - <<As()->cuda_stream()>>>(weight_buf, indices_buf, out_buf, - num_indices, emb_size, emb_dim); - } -}; - -template -struct EmbeddingGradFunctor final { - void operator()(ep::Stream* stream, const T* dy_buf, const IndexType* indices_buf, T* dx_buf, - const int64_t padding_idx, const bool scale_grad_by_freq, - const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim, - int32_t* tmp_buf) { - embedding_grad_kernel - <<As()->cuda_stream()>>>(dy_buf, indices_buf, dx_buf, padding_idx, - num_indices, emb_dim); - if (scale_grad_by_freq) { - indices_freq_kernel<<As()->cuda_stream()>>>( - indices_buf, num_indices, tmp_buf, emb_size); - emb_scale_kernel - <<As()->cuda_stream()>>>(dx_buf, emb_size, emb_dim, tmp_buf); - } - } -}; - -#define INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ - template struct EmbeddingReNormFunctor; \ - template struct EmbeddingFunctor; \ - template struct EmbeddingGradFunctor; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL, - EMBEDDING_DATA_TYPE_SEQ_CUDA, INDEX_DATA_TYPE_SEQ); - -#undef INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/embedding_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct AccumulateType { + using type = T; +}; + +template<> +struct AccumulateType { + using type = float; +}; + +template +__global__ void embedding_kernel(const T* weight_buf, const IndexType* indices_buf, T* out_buf, + const int64_t num_indices, const int64_t emb_size, + const int64_t emb_dim) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { + IndexType indices_index = i / emb_dim; + IndexType emb_dim_index = i - indices_index * emb_dim; + IndexType emb_size_index = indices_buf[indices_index]; + assert(emb_size_index >= 0 && emb_size_index < emb_size); + IndexType from_index = emb_size_index * emb_dim + emb_dim_index; + out_buf[i] = weight_buf[from_index]; + } +} + +template +__global__ void embedding_grad_kernel(const T* dy_buf, const IndexType* indices_buf, T* dx_buf, + const int64_t padding_idx, const int64_t num_indices, + const int64_t emb_dim) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { + IndexType indices_index = i / emb_dim; + IndexType emb_dim_index = i - indices_index * emb_dim; + IndexType emb_size_index = indices_buf[indices_index]; + if (emb_size_index != padding_idx) { + IndexType from_index = emb_size_index * emb_dim + emb_dim_index; + cuda::atomic::Add(dx_buf + from_index, dy_buf[i]); + } + } +} + +template +__global__ void indices_freq_kernel(const IndexType* indices_buf, const int64_t num_indices, + int32_t* indices_freq, const int64_t emb_size) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_indices) { + IndexType index = indices_buf[i]; + assert(index >= 0 && index < emb_size); + cuda::atomic::Add(indices_freq + index, 1); + } +} + +template +__global__ void emb_scale_kernel(T* dx_buf, const int64_t emb_size, const int64_t emb_dim, + int32_t* indices_freq) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, emb_size * emb_dim) { + IndexType emb_size_index = i / emb_dim; + if (indices_freq[emb_size_index] > 1) { + dx_buf[i] /= static_cast(indices_freq[emb_size_index]); + } + } +} + +template +__global__ void embedding_renorm_kernel(const T* in_buf, T* out_buf, int32_t* indices_freq, + const AccumType max_norm, const AccumType norm_type, + const int64_t emb_size, const int64_t emb_dim) { + int64_t tid = threadIdx.x; + for (int64_t emb_idx = blockIdx.x; emb_idx < emb_size; emb_idx += gridDim.x) { + if (indices_freq[emb_idx] == 0) { continue; } + int64_t base_index = emb_idx * emb_dim; + + AccumType v = 0; + for (int64_t i = tid; i < emb_dim; i += blockDim.x) { + v += pow(abs(static_cast(in_buf[base_index + i])), norm_type); + } + + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ AccumType norm; + v = BlockReduce(temp_storage).Sum(v); + + if (tid == 0) { norm = pow(v, static_cast(1.0 / norm_type)); } + __syncthreads(); + + if (norm > max_norm) { + auto scale = static_cast(max_norm / (norm + 1e-7)); + for (int64_t i = tid; i < emb_dim; i += blockDim.x) { + out_buf[base_index + i] = in_buf[base_index + i] * scale; + } + } + } +} + +} // namespace + +template +struct EmbeddingReNormFunctor final { + void operator()(ep::Stream* stream, const T* in_buf, const IndexType* indices_buf, T* out_buf, + const double max_norm, const double norm_type, const int64_t num_indices, + const int64_t emb_size, const int64_t emb_dim, int32_t* tmp_buf) { + 
indices_freq_kernel<<As()->cuda_stream()>>>( + indices_buf, num_indices, tmp_buf, emb_size); + + using AccumType = typename AccumulateType::type; + embedding_renorm_kernel + <<As()->cuda_stream()>>>( + in_buf, out_buf, tmp_buf, static_cast(max_norm), + static_cast(norm_type), emb_size, emb_dim); + } +}; + +template +struct EmbeddingFunctor final { + void operator()(ep::Stream* stream, const T* weight_buf, const IndexType* indices_buf, T* out_buf, + const int64_t padding_idx, const bool scale_grad_by_freq, + const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim) { + embedding_kernel + <<As()->cuda_stream()>>>(weight_buf, indices_buf, out_buf, + num_indices, emb_size, emb_dim); + } +}; + +template +struct EmbeddingGradFunctor final { + void operator()(ep::Stream* stream, const T* dy_buf, const IndexType* indices_buf, T* dx_buf, + const int64_t padding_idx, const bool scale_grad_by_freq, + const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim, + int32_t* tmp_buf) { + embedding_grad_kernel + <<As()->cuda_stream()>>>(dy_buf, indices_buf, dx_buf, padding_idx, + num_indices, emb_dim); + if (scale_grad_by_freq) { + indices_freq_kernel<<As()->cuda_stream()>>>( + indices_buf, num_indices, tmp_buf, emb_size); + emb_scale_kernel + <<As()->cuda_stream()>>>(dx_buf, emb_size, emb_dim, tmp_buf); + } + } +}; + +#define INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ + template struct EmbeddingReNormFunctor; \ + template struct EmbeddingFunctor; \ + template struct EmbeddingGradFunctor; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL, + EMBEDDING_DATA_TYPE_SEQ_CUDA, INDEX_DATA_TYPE_SEQ); + +#undef INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/erfinv_kernel.hip.cpp b/oneflow/user/kernels/erfinv_kernel.hip.cpp index 7b057e6..977a8ed 100644 --- a/oneflow/user/kernels/erfinv_kernel.hip.cpp +++ b/oneflow/user/kernels/erfinv_kernel.hip.cpp @@ -1,61 +1,61 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" - -namespace oneflow { - -template -struct ErfInvFunctor { - OF_DEVICE_FUNC ErfInvFunctor() {} - OF_DEVICE_FUNC T operator()(T x) const { return erfinv(x); } -}; - -template -class GpuErfinvKernel final : public user_op::OpKernel { - public: - GpuErfinvKernel() = default; - ~GpuErfinvKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - OF_CUDA_CHECK(cuda::elementwise::Unary(ErfInvFunctor(), elem_cnt, y->mut_dptr(), - x->dptr(), - ctx->stream()->As()->cuda_stream())); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ERFINV_KERNEL(dtype) \ - REGISTER_USER_KERNEL("erfinv") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "x", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_ERFINV_KERNEL(float) -REGISTER_CUDA_ERFINV_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/hip/elementwise.hip.h"
+
+namespace oneflow {
+
+template<typename T>
+struct ErfInvFunctor {
+  OF_DEVICE_FUNC ErfInvFunctor() {}
+  OF_DEVICE_FUNC T operator()(T x) const { return erfinv(x); }
+};
+
+template<typename T>
+class GpuErfinvKernel final : public user_op::OpKernel {
+ public:
+  GpuErfinvKernel() = default;
+  ~GpuErfinvKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
+    user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0);
+    const int32_t elem_cnt = x->shape_view().elem_cnt();
+    OF_CUDA_CHECK(cuda::elementwise::Unary(ErfInvFunctor<T>(), elem_cnt, y->mut_dptr<T>(),
+                                           x->dptr<T>(),
+                                           ctx->stream()->As<ep::CudaStream>()->cuda_stream()));
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_CUDA_ERFINV_KERNEL(dtype)                                                      \
+  REGISTER_USER_KERNEL("erfinv")                                                                \
+      .SetCreateFn<GpuErfinvKernel<dtype>>()                                                    \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
+                       && (user_op::HobDataType("y", 0) == GetDataType<dtype>::value))          \
+      .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
+                               user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
+        OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "x", 0, true));                          \
+        return Maybe<void>::Ok();                                                               \
+      });
+
+REGISTER_CUDA_ERFINV_KERNEL(float)
+REGISTER_CUDA_ERFINV_KERNEL(double)
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/expand_kernel.hip.cpp b/oneflow/user/kernels/expand_kernel.hip.cpp
index 07699da..5f417e3 100644
--- a/oneflow/user/kernels/expand_kernel.hip.cpp
+++ b/oneflow/user/kernels/expand_kernel.hip.cpp
@@ -1,220 +1,220 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/expand_kernel_utils.h" - -namespace oneflow { - -namespace { - -const int32_t NDIMS = 16; -struct STRIDES { - int32_t val[NDIMS]; -}; - -template -__global__ void ExpandCudaKernel(const T* in_ptr, const STRIDES in_stride, - const STRIDES expand_stride, const int32_t dims, - const int32_t elements, T* out_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = OffsetToNdIndexToOffset(gid, in_stride.val, expand_stride.val, dims); - out_ptr[gid] = in_ptr[offset]; - gid += step; - } -} - -template -__global__ void ExpandGradCudaKernel(const T* out_diff_ptr, const STRIDES out_stride, - const STRIDES expand_stride, const int32_t dims, - const int32_t elements, T* in_diff_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = OffsetToNdIndexToOffset(gid, out_stride.val, expand_stride.val, dims); - cuda::atomic::Add(&in_diff_ptr[offset], out_diff_ptr[gid]); - gid += step; - } -} - -template -__global__ void InitPtr(const int32_t elements, T* ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - ptr[gid] = static_cast(0); - gid += step; - } -} - -template -struct GpuExpandFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, - const STRIDES expand_stride, const int32_t dims, const int32_t elements, - T* out_ptr) { - RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, dims, - elements, out_ptr); - } -}; - -template<> -void GpuExpandFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const STRIDES in_stride, const STRIDES expand_stride, - const int32_t dims, const int32_t elements, - float16* out_ptr) { - RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, reinterpret_cast(in_ptr), - in_stride, expand_stride, dims, elements, reinterpret_cast(out_ptr)); -} - -template -struct GpuExpandGradFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, - const STRIDES expand_stride, const int32_t dims, const int32_t elements, - const int32_t out_elements, T* out_ptr) { - RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, out_ptr); - RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, - dims, elements, out_ptr); - } -}; - -template<> -void GpuExpandGradFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const STRIDES in_stride, const STRIDES expand_stride, - const int32_t dims, const int32_t elements, - const int32_t out_elements, float16* out_ptr) { - RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, - reinterpret_cast(out_ptr)); - RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, - reinterpret_cast(in_ptr), in_stride, expand_stride, dims, elements, - reinterpret_cast(out_ptr)); -} - -} // namespace - -template -class GpuExpandKernel final : public user_op::OpKernel { - public: - GpuExpandKernel() = default; - ~GpuExpandKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 
0); - const std::vector& logical_expand_shape = - ctx->Attr>("logical_expand_shape"); - if (std::any_of(logical_expand_shape.begin(), logical_expand_shape.end(), - [](int32_t dim_size) { return dim_size == 0; })) { - return; - } - std::vector in_shape; - in_shape.resize(in->shape_view().NumAxes()); - for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } - - std::vector out_shape; - std::vector stride; - CHECK_JUST(getOutShapeAndStrideForFp(in_shape, logical_expand_shape, out_shape, stride)); - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - const int32_t out_dims = out->shape_view().NumAxes(); - const int32_t out_size = out->shape_view().elem_cnt(); - - STRIDES expand_stride; - for (int i = 0; i < out_dims; ++i) { expand_stride.val[i] = stride[i]; } - STRIDES out_stride; - InitStride(out_stride.val, out_shape.data(), out_dims); - GpuExpandFunctor()(ctx->stream(), in_ptr, out_stride, expand_stride, out_dims, out_size, - out_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_EXPAND_KERNEL(dtype) \ - REGISTER_USER_KERNEL("expand").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_EXPAND_KERNEL(float); -REGISTER_EXPAND_KERNEL(double); -REGISTER_EXPAND_KERNEL(float16); -REGISTER_EXPAND_KERNEL(bool); -REGISTER_EXPAND_KERNEL(uint8_t); -REGISTER_EXPAND_KERNEL(int8_t); -REGISTER_EXPAND_KERNEL(int32_t); -REGISTER_EXPAND_KERNEL(int64_t); - -template -class GpuExpandGradKernel final : public user_op::OpKernel { - public: - GpuExpandGradKernel() = default; - ~GpuExpandGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const std::vector& logical_out_shape = - ctx->Attr>("logical_out_shape"); - const std::vector& logical_expand_shape = - ctx->Attr>("logical_expand_shape"); - - std::vector in_shape; - in_shape.resize(in->shape_view().NumAxes()); - for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } - std::vector out_shape; - std::vector stride; - CHECK_JUST(getOutShapeAndStrideForBp(logical_out_shape, logical_expand_shape, in_shape, - out_shape, stride)); - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - - const int32_t in_dims = in->shape_view().NumAxes(); - const int32_t in_size = in->shape_view().elem_cnt(); - const int32_t out_size = out->shape_view().elem_cnt(); - - STRIDES expand_stride; - for (int i = 0; i < in_dims; ++i) { expand_stride.val[i] = stride[i]; } - STRIDES in_stride; - InitStride(in_stride.val, in_shape.data(), in_dims); - - GpuExpandGradFunctor()(ctx->stream(), in_ptr, in_stride, expand_stride, in_dims, in_size, - out_size, out_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_EXPAND_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("expand_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_EXPAND_GRAD_KERNEL(float); -REGISTER_EXPAND_GRAD_KERNEL(double); -REGISTER_EXPAND_GRAD_KERNEL(float16); -REGISTER_EXPAND_GRAD_KERNEL(int32_t); -REGISTER_EXPAND_GRAD_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/expand_kernel_utils.h" + +namespace oneflow { + +namespace { + +const int32_t NDIMS = 16; +struct STRIDES { + int32_t val[NDIMS]; +}; + +template +__global__ void ExpandCudaKernel(const T* in_ptr, const STRIDES in_stride, + const STRIDES expand_stride, const int32_t dims, + const int32_t elements, T* out_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = OffsetToNdIndexToOffset(gid, in_stride.val, expand_stride.val, dims); + out_ptr[gid] = in_ptr[offset]; + gid += step; + } +} + +template +__global__ void ExpandGradCudaKernel(const T* out_diff_ptr, const STRIDES out_stride, + const STRIDES expand_stride, const int32_t dims, + const int32_t elements, T* in_diff_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = OffsetToNdIndexToOffset(gid, out_stride.val, expand_stride.val, dims); + cuda::atomic::Add(&in_diff_ptr[offset], out_diff_ptr[gid]); + gid += step; + } +} + +template +__global__ void InitPtr(const int32_t elements, T* ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + ptr[gid] = static_cast(0); + gid += step; + } +} + +template +struct GpuExpandFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, + const STRIDES expand_stride, const int32_t dims, const int32_t elements, + T* out_ptr) { + RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, dims, + elements, out_ptr); + } +}; + +template<> +void GpuExpandFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const STRIDES in_stride, const STRIDES expand_stride, + const int32_t dims, const int32_t elements, + float16* out_ptr) { + RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, reinterpret_cast(in_ptr), + in_stride, expand_stride, dims, elements, reinterpret_cast(out_ptr)); +} + +template +struct GpuExpandGradFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, + const STRIDES expand_stride, const int32_t dims, const int32_t elements, + const int32_t out_elements, T* out_ptr) { + RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, out_ptr); + RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, + dims, elements, out_ptr); + } +}; + +template<> +void GpuExpandGradFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const STRIDES in_stride, const STRIDES expand_stride, + const int32_t dims, const int32_t elements, + const int32_t out_elements, float16* out_ptr) { + RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, + reinterpret_cast(out_ptr)); + 
RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, + reinterpret_cast(in_ptr), in_stride, expand_stride, dims, elements, + reinterpret_cast(out_ptr)); +} + +} // namespace + +template +class GpuExpandKernel final : public user_op::OpKernel { + public: + GpuExpandKernel() = default; + ~GpuExpandKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::vector& logical_expand_shape = + ctx->Attr>("logical_expand_shape"); + if (std::any_of(logical_expand_shape.begin(), logical_expand_shape.end(), + [](int32_t dim_size) { return dim_size == 0; })) { + return; + } + std::vector in_shape; + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } + + std::vector out_shape; + std::vector stride; + CHECK_JUST(getOutShapeAndStrideForFp(in_shape, logical_expand_shape, out_shape, stride)); + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + const int32_t out_dims = out->shape_view().NumAxes(); + const int32_t out_size = out->shape_view().elem_cnt(); + + STRIDES expand_stride; + for (int i = 0; i < out_dims; ++i) { expand_stride.val[i] = stride[i]; } + STRIDES out_stride; + InitStride(out_stride.val, out_shape.data(), out_dims); + GpuExpandFunctor()(ctx->stream(), in_ptr, out_stride, expand_stride, out_dims, out_size, + out_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_EXPAND_KERNEL(dtype) \ + REGISTER_USER_KERNEL("expand").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_EXPAND_KERNEL(float); +REGISTER_EXPAND_KERNEL(double); +REGISTER_EXPAND_KERNEL(float16); +REGISTER_EXPAND_KERNEL(bool); +REGISTER_EXPAND_KERNEL(uint8_t); +REGISTER_EXPAND_KERNEL(int8_t); +REGISTER_EXPAND_KERNEL(int32_t); +REGISTER_EXPAND_KERNEL(int64_t); + +template +class GpuExpandGradKernel final : public user_op::OpKernel { + public: + GpuExpandGradKernel() = default; + ~GpuExpandGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::vector& logical_out_shape = + ctx->Attr>("logical_out_shape"); + const std::vector& logical_expand_shape = + ctx->Attr>("logical_expand_shape"); + + std::vector in_shape; + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } + std::vector out_shape; + std::vector stride; + CHECK_JUST(getOutShapeAndStrideForBp(logical_out_shape, logical_expand_shape, in_shape, + out_shape, stride)); + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + + const int32_t in_dims = in->shape_view().NumAxes(); + const int32_t in_size = in->shape_view().elem_cnt(); + const int32_t out_size = out->shape_view().elem_cnt(); + + STRIDES expand_stride; + for (int i = 0; i < in_dims; ++i) { expand_stride.val[i] = stride[i]; } + STRIDES in_stride; + InitStride(in_stride.val, in_shape.data(), in_dims); + + GpuExpandGradFunctor()(ctx->stream(), in_ptr, in_stride, expand_stride, in_dims, in_size, + out_size, 
out_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_EXPAND_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("expand_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_EXPAND_GRAD_KERNEL(float); +REGISTER_EXPAND_GRAD_KERNEL(double); +REGISTER_EXPAND_GRAD_KERNEL(float16); +REGISTER_EXPAND_GRAD_KERNEL(int32_t); +REGISTER_EXPAND_GRAD_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/eye_kernel_util.hip.cpp b/oneflow/user/kernels/eye_kernel_util.hip.cpp index 84fd109..978c987 100644 --- a/oneflow/user/kernels/eye_kernel_util.hip.cpp +++ b/oneflow/user/kernels/eye_kernel_util.hip.cpp @@ -1,40 +1,40 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/eye_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template -__global__ void EyeForwardGpuKernel(const int64_t cols, const int64_t rows, T* out) { - SetOneInDiag(cols, rows, out); -} - -template -struct EyeFunctor final { - void operator()(ep::Stream* stream, const int64_t& cols, const int64_t& rows, T* out) { - RUN_CUDA_KERNEL((EyeForwardGpuKernel), stream, rows, cols, rows, out); - } -}; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_EYE_FUNCTOR, (DeviceType::kCUDA), EYE_DATA_TYPE_SEQ); -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/eye_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template +__global__ void EyeForwardGpuKernel(const int64_t cols, const int64_t rows, T* out) { + SetOneInDiag(cols, rows, out); +} + +template +struct EyeFunctor final { + void operator()(ep::Stream* stream, const int64_t& cols, const int64_t& rows, T* out) { + RUN_CUDA_KERNEL((EyeForwardGpuKernel), stream, rows, cols, rows, out); + } +}; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_EYE_FUNCTOR, (DeviceType::kCUDA), EYE_DATA_TYPE_SEQ); +} // namespace user_op +} // namespace oneflow + #endif // End WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/fake_quantization_kernel.hip.cpp b/oneflow/user/kernels/fake_quantization_kernel.hip.cpp index 126595e..a992cc2 100644 --- a/oneflow/user/kernels/fake_quantization_kernel.hip.cpp +++ b/oneflow/user/kernels/fake_quantization_kernel.hip.cpp @@ -1,160 +1,160 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void FakeQuantizationSymmetric(const T* in_ptr, const T* scale_ptr, - const int64_t scale_size, const int64_t elements, - const int64_t panel_size, const double quantization_bit, - T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out * scale; - - gid += step; - } -} - -template -__global__ void FakeQuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, - const int64_t scale_size, const int64_t elements, - const int64_t panel_size, const double quantization_bit, - T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; - T lower_bound = 0; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - T zero_point = zero_point_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale + zero_point); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? 
lower_bound : out; - out_ptr[gid] = (out - zero_point) * scale; - - gid += step; - } -} - -template -__global__ void FakeQuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, - const int64_t elements, const int64_t panel_size, - const double quantization_bit, T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - T scale = static_cast(pow(2.0, static_cast(shift[0]))); - - while (gid < elements) { - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out * scale; - gid += step; - } -} - -} // namespace - -template -class GpuFakeQuantizationKernel final : public user_op::OpKernel { - public: - GpuFakeQuantizationKernel() = default; - ~GpuFakeQuantizationKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - const int64_t elements = in->shape_view().elem_cnt(); - const int64_t panel_size = in->shape_view().Count(1); - const int64_t scale_size = scale->shape_view().elem_cnt(); - - // round to even - auto origin_round_mode = std::fegetround(); - std::fesetround(FE_TONEAREST); - - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - RUN_CUDA_KERNEL((FakeQuantizationSymmetric), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { // quantization_scheme == "affine" - RUN_CUDA_KERNEL((FakeQuantizationAffine), ctx->stream(), elements, in->dptr(), - scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, - quantization_bit, out->mut_dptr()); - } - } else if (quantization_formula == "cambricon") { - RUN_CUDA_KERNEL((FakeQuantizationCambricon), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { - UNIMPLEMENTED(); - } - - std::fesetround(origin_round_mode); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FAKE_QUANTIZATION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fake_quantization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_FAKE_QUANTIZATION_KERNEL(float); -REGISTER_FAKE_QUANTIZATION_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void FakeQuantizationSymmetric(const T* in_ptr, const T* scale_ptr, + const int64_t scale_size, const int64_t elements, + const int64_t panel_size, const double quantization_bit, + T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = out * scale; + + gid += step; + } +} + +template +__global__ void FakeQuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, + const int64_t scale_size, const int64_t elements, + const int64_t panel_size, const double quantization_bit, + T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; + T lower_bound = 0; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + T zero_point = zero_point_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale + zero_point); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = (out - zero_point) * scale; + + gid += step; + } +} + +template +__global__ void FakeQuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, + const int64_t elements, const int64_t panel_size, + const double quantization_bit, T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + T scale = static_cast(pow(2.0, static_cast(shift[0]))); + + while (gid < elements) { + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? 
lower_bound : out; + out_ptr[gid] = out * scale; + gid += step; + } +} + +} // namespace + +template +class GpuFakeQuantizationKernel final : public user_op::OpKernel { + public: + GpuFakeQuantizationKernel() = default; + ~GpuFakeQuantizationKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t panel_size = in->shape_view().Count(1); + const int64_t scale_size = scale->shape_view().elem_cnt(); + + // round to even + auto origin_round_mode = std::fegetround(); + std::fesetround(FE_TONEAREST); + + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + RUN_CUDA_KERNEL((FakeQuantizationSymmetric), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { // quantization_scheme == "affine" + RUN_CUDA_KERNEL((FakeQuantizationAffine), ctx->stream(), elements, in->dptr(), + scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, + quantization_bit, out->mut_dptr()); + } + } else if (quantization_formula == "cambricon") { + RUN_CUDA_KERNEL((FakeQuantizationCambricon), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + + std::fesetround(origin_round_mode); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FAKE_QUANTIZATION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fake_quantization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_FAKE_QUANTIZATION_KERNEL(float); +REGISTER_FAKE_QUANTIZATION_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fill_kernel.hip.cpp b/oneflow/user/kernels/fill_kernel.hip.cpp index b123325..290575f 100644 --- a/oneflow/user/kernels/fill_kernel.hip.cpp +++ b/oneflow/user/kernels/fill_kernel.hip.cpp @@ -1,61 +1,61 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/kernel/new_kernel_util.h" - -namespace oneflow { - -namespace { -template -__global__ void FillTensorGpuForward(const int n, const T* value, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = value[0]; } -} -}; // namespace - -template -class FillTensorGpuKernel final : public user_op::OpKernel { - public: - FillTensorGpuKernel() = default; - ~FillTensorGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); - const int32_t elem_cnt = in->shape_view().elem_cnt(); - RUN_CUDA_KERNEL((FillTensorGpuForward), ctx->stream(), elem_cnt, elem_cnt, value->dptr(), - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FILL_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fill_tensor_") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_FILL_CUDA_KERNEL(float) -REGISTER_FILL_CUDA_KERNEL(double) -REGISTER_FILL_CUDA_KERNEL(int8_t) -REGISTER_FILL_CUDA_KERNEL(int32_t) -REGISTER_FILL_CUDA_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/kernel/new_kernel_util.h" + +namespace oneflow { + +namespace { +template +__global__ void FillTensorGpuForward(const int n, const T* value, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = value[0]; } +} +}; // namespace + +template +class FillTensorGpuKernel final : public user_op::OpKernel { + public: + FillTensorGpuKernel() = default; + ~FillTensorGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); + const int32_t elem_cnt = in->shape_view().elem_cnt(); + RUN_CUDA_KERNEL((FillTensorGpuForward), ctx->stream(), elem_cnt, elem_cnt, value->dptr(), + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FILL_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fill_tensor_") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_FILL_CUDA_KERNEL(float) +REGISTER_FILL_CUDA_KERNEL(double) +REGISTER_FILL_CUDA_KERNEL(int8_t) +REGISTER_FILL_CUDA_KERNEL(int32_t) +REGISTER_FILL_CUDA_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/flip_kernel.hip.cpp b/oneflow/user/kernels/flip_kernel.hip.cpp index 2c191a1..35b9a9b 100644 --- a/oneflow/user/kernels/flip_kernel.hip.cpp +++ b/oneflow/user/kernels/flip_kernel.hip.cpp @@ -1,104 +1,104 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/ep/include/stream.h" - -namespace oneflow { - -namespace { - -const int32_t NDIMS = 16; -struct SIZE_V { - int32_t val[NDIMS]; -}; - -struct VIS { - bool val[NDIMS] = {false}; -}; - -template -__global__ void FlipGpuForward(const int32_t element, const int64_t total_dims, - const SIZE_V sizes_v, const VIS vis, SIZE_V strides_v, - const T* in_dptr, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(i, element) { - int32_t cur_indices = i; - int32_t rem = 0; - int32_t dst_offset = 0; - for (int32_t d = 0; d < total_dims; d++) { - int32_t temp = cur_indices; - cur_indices = cur_indices / strides_v.val[d]; - rem = temp - cur_indices * strides_v.val[d]; - dst_offset += vis.val[d] ? 
(sizes_v.val[d] - 1 - cur_indices) * strides_v.val[d] - : cur_indices * strides_v.val[d]; - cur_indices = rem; - } - out_dptr[i] = in_dptr[dst_offset]; - } -} - -} // namespace - -template -class FlipGpuKernel final : public user_op::OpKernel { - public: - FlipGpuKernel() = default; - ~FlipGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = y_tensor->shape_view().elem_cnt(); - if (elem_cnt == 0) { return; } - const int32_t total_dims = y_tensor->shape_view().NumAxes(); - - std::vector dims = ctx->Attr>("dims"); - VIS vis; - for (auto x : dims) { vis.val[x] = true; } - - SIZE_V sizes_v; - for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape_view().At(i); } - - // TODO(bbuf) delete strides caluculate, after tensor strides supported - SIZE_V strides_v; - strides_v.val[total_dims - 1] = 1; - for (int32_t i = total_dims - 2; i >= 0; i--) { - strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape_view().At(i + 1); - } - RUN_CUDA_KERNEL((FlipGpuForward), ctx->stream(), elem_cnt, elem_cnt, total_dims, sizes_v, - vis, strides_v, x_tensor->dptr(), y_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FLIP_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("flip").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_FLIP_CUDA_KERNEL(bool) -REGISTER_FLIP_CUDA_KERNEL(float) -REGISTER_FLIP_CUDA_KERNEL(double) -REGISTER_FLIP_CUDA_KERNEL(uint8_t) -REGISTER_FLIP_CUDA_KERNEL(int8_t) -REGISTER_FLIP_CUDA_KERNEL(int32_t) -REGISTER_FLIP_CUDA_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/ep/include/stream.h" + +namespace oneflow { + +namespace { + +const int32_t NDIMS = 16; +struct SIZE_V { + int32_t val[NDIMS]; +}; + +struct VIS { + bool val[NDIMS] = {false}; +}; + +template +__global__ void FlipGpuForward(const int32_t element, const int64_t total_dims, + const SIZE_V sizes_v, const VIS vis, SIZE_V strides_v, + const T* in_dptr, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(i, element) { + int32_t cur_indices = i; + int32_t rem = 0; + int32_t dst_offset = 0; + for (int32_t d = 0; d < total_dims; d++) { + int32_t temp = cur_indices; + cur_indices = cur_indices / strides_v.val[d]; + rem = temp - cur_indices * strides_v.val[d]; + dst_offset += vis.val[d] ? 
(sizes_v.val[d] - 1 - cur_indices) * strides_v.val[d] + : cur_indices * strides_v.val[d]; + cur_indices = rem; + } + out_dptr[i] = in_dptr[dst_offset]; + } +} + +} // namespace + +template +class FlipGpuKernel final : public user_op::OpKernel { + public: + FlipGpuKernel() = default; + ~FlipGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t elem_cnt = y_tensor->shape_view().elem_cnt(); + if (elem_cnt == 0) { return; } + const int32_t total_dims = y_tensor->shape_view().NumAxes(); + + std::vector dims = ctx->Attr>("dims"); + VIS vis; + for (auto x : dims) { vis.val[x] = true; } + + SIZE_V sizes_v; + for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape_view().At(i); } + + // TODO(bbuf) delete strides caluculate, after tensor strides supported + SIZE_V strides_v; + strides_v.val[total_dims - 1] = 1; + for (int32_t i = total_dims - 2; i >= 0; i--) { + strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape_view().At(i + 1); + } + RUN_CUDA_KERNEL((FlipGpuForward), ctx->stream(), elem_cnt, elem_cnt, total_dims, sizes_v, + vis, strides_v, x_tensor->dptr(), y_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FLIP_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("flip").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_FLIP_CUDA_KERNEL(bool) +REGISTER_FLIP_CUDA_KERNEL(float) +REGISTER_FLIP_CUDA_KERNEL(double) +REGISTER_FLIP_CUDA_KERNEL(uint8_t) +REGISTER_FLIP_CUDA_KERNEL(int8_t) +REGISTER_FLIP_CUDA_KERNEL(int32_t) +REGISTER_FLIP_CUDA_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fold_kernel_util.hip.cpp b/oneflow/user/kernels/fold_kernel_util.hip.cpp index 7085abc..ea82e73 100644 --- a/oneflow/user/kernels/fold_kernel_util.hip.cpp +++ b/oneflow/user/kernels/fold_kernel_util.hip.cpp @@ -1,75 +1,75 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
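For reference, a minimal host-side sketch of the index transform FlipGpuForward performs: each destination offset is decomposed into per-dimension coordinates via contiguous row-major strides (built the same way as in FlipGpuKernel::Compute), and every flipped dimension has its coordinate mirrored. The helper name is hypothetical and this sketch is not part of the patch.

#include <cstdint>
#include <vector>

int64_t FlippedSrcOffset(int64_t dst_offset, const std::vector<int64_t>& sizes,
                         const std::vector<bool>& flipped) {
  const int64_t ndims = static_cast<int64_t>(sizes.size());
  // Contiguous (row-major) strides, matching the strides_v computed in the kernel wrapper.
  std::vector<int64_t> strides(ndims, 1);
  for (int64_t d = ndims - 2; d >= 0; --d) { strides[d] = strides[d + 1] * sizes[d + 1]; }
  int64_t remaining = dst_offset;
  int64_t src_offset = 0;
  for (int64_t d = 0; d < ndims; ++d) {
    const int64_t coord = remaining / strides[d];
    remaining -= coord * strides[d];
    // A flipped dimension maps coordinate c to (size - 1 - c); other dimensions pass through.
    const int64_t src_coord = flipped[d] ? sizes[d] - 1 - coord : coord;
    src_offset += src_coord * strides[d];
  }
  return src_offset;
}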
-*/ -#ifdef WITH_ROCM - -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/fold_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace user_op { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -// NDIM range: (1, 2, 3) -// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first -template -__global__ void CudaFoldForward(FoldParams params, const T* input_ptr, - T* output_ptr) { - CUDA_1D_KERNEL_LOOP_T(INDEX_T, in_offset, params.in_elem_cnt) { - using ParamType = FoldParams; - INDEX_T in_index[ParamType::kInputNDim] = {0}; - INDEX_T out_index[ParamType::kOutputNDim] = {0}; - params.in_index_helper.OffsetToNdIndex(in_offset, in_index); - if (!FoldIndexTransform(params, in_index, out_index)) { - INDEX_T out_offset = params.out_index_helper.NdIndexToOffset(out_index); - XPUAdd::Invoke(&input_ptr[in_offset], &output_ptr[out_offset]); - } else { - continue; - } - } -} - -} // namespace - -template -struct FoldKernelUtil { - using ParamType = FoldParams; - static void Forward(ep::Stream* stream, const void* raw_params, const T* input_ptr, - T* output_ptr) { - const auto* fold_params = static_cast(raw_params); - CudaFoldForward - <<in_elem_cnt), kBlockSize, 0, - stream->As()->cuda_stream()>>>(*fold_params, input_ptr, output_ptr); - } -}; - -INSTANTIATE_FOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) - -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM + +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/fold_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace user_op { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +// NDIM range: (1, 2, 3) +// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first +template +__global__ void CudaFoldForward(FoldParams params, const T* input_ptr, + T* output_ptr) { + CUDA_1D_KERNEL_LOOP_T(INDEX_T, in_offset, params.in_elem_cnt) { + using ParamType = FoldParams; + INDEX_T in_index[ParamType::kInputNDim] = {0}; + INDEX_T out_index[ParamType::kOutputNDim] = {0}; + params.in_index_helper.OffsetToNdIndex(in_offset, in_index); + if (!FoldIndexTransform(params, in_index, out_index)) { + INDEX_T out_offset = params.out_index_helper.NdIndexToOffset(out_index); + XPUAdd::Invoke(&input_ptr[in_offset], &output_ptr[out_offset]); + } else { + continue; + } + } +} + +} // namespace + +template +struct FoldKernelUtil { + using ParamType = FoldParams; + static void Forward(ep::Stream* stream, const void* raw_params, const T* input_ptr, + T* output_ptr) { + const auto* fold_params = static_cast(raw_params); + CudaFoldForward + <<in_elem_cnt), kBlockSize, 0, + stream->As()->cuda_stream()>>>(*fold_params, input_ptr, output_ptr); + } +}; + +INSTANTIATE_FOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) + +} // namespace user_op +} // namespace oneflow #endif \ No newline at end of file diff --git a/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp b/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp index 079f394..ceabaa0 100644 --- a/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp @@ -1,456 +1,456 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
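The fold kernel above iterates with CUDA_1D_KERNEL_LOOP_T, which (as assumed here) expands to the usual grid-stride loop so that any element count is covered regardless of the launched grid size. A minimal standalone HIP sketch of that pattern with a hypothetical kernel, not part of the patch:

#include "hip/hip_runtime.h"

template<typename IndexType>
__global__ void GridStrideIota(IndexType n, IndexType* out) {
  // Each thread starts at its global id and advances by the total number of
  // launched threads, so all n elements are visited exactly once.
  for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<IndexType>(gridDim.x) * blockDim.x) {
    out[i] = i;
  }
}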
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct GeluFunctor { - __device__ T Compute(T x, int64_t i) const { - return static_cast(0.5) * x * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x)); - } -}; - -template<> -struct GeluFunctor { - GeluFunctor float_functor; - __device__ half Compute(half x, int64_t i) const { - return __float2half(float_functor.Compute(__half2float(x), i)); - } - __device__ half2 ComputeHalf2(half2 x, int64_t i) const { - half2 y; - y.data.x = __float2half(float_functor.Compute(__half2float(x.data.x), 2 * i)); - y.data.y = __float2half(float_functor.Compute(__half2float(x.data.y), 2 * i + 1)); - return y; - } -}; - -template -struct MaskAndScaleFunctor { - MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} - __device__ T Compute(T x, int64_t i) const { return x * static_cast(mask[i]) * scale; } - const bool* mask; - float scale; -}; - -template<> -struct MaskAndScaleFunctor { - MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} - __device__ half Compute(half x, int64_t i) const { - return x * static_cast(mask[i] * scale); - } - __device__ half2 ComputeHalf2(half2 x, int64_t i) const { - const char2* mask_c2 = reinterpret_cast(mask); - char2 mask_val = mask_c2[i]; - half2 one_or_zero_h2; - half2 h2_scale = __float2half2_rn(scale); - one_or_zero_h2.data.x = mask_val.x; - one_or_zero_h2.data.y = mask_val.y; - return __hmul2(__hmul2(x, one_or_zero_h2), h2_scale); - } - const bool* mask; - float scale; -}; - -template -struct MaskAndScaleAddFunctor { - MaskAndScaleAddFunctor(const bool* mask, const T* addend, float scale) - : mask(mask), addend(addend), scale(scale) {} - __device__ T Compute(T x, int64_t i) const { - return x * static_cast(mask[i]) * scale + addend[i]; - } - const bool* mask; - const T* addend; - float scale; -}; - -template<> -struct MaskAndScaleAddFunctor { - MaskAndScaleAddFunctor(const bool* mask, const half* addend, float scale) - : mask(mask), addend(addend), scale(scale) {} - __device__ half Compute(half x, int64_t i) const { - return x * static_cast(mask[i] * scale) + addend[i]; - } - __device__ half2 ComputeHalf2(half2 x, int64_t i) const { - const char2* mask_c2 = reinterpret_cast(mask); - const half2* addend_h2 = reinterpret_cast(addend); - char2 mask_val = mask_c2[i]; - half2 one_or_zero_h2; - half2 h2_scale = __float2half2_rn(scale); - one_or_zero_h2.data.x = mask_val.x; - one_or_zero_h2.data.y = mask_val.y; - return __hadd2(__hmul2(__hmul2(x, one_or_zero_h2), h2_scale), addend_h2[i]); - } - const bool* mask; - const half* addend; - float scale; -}; - -template -struct GeluGradFunctor { - const T coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); - __device__ T Compute(T x, T dy, int64_t i) const { - return static_cast(0.5) - * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) - + x * coef * exp(static_cast(-0.5) * x * x)) - * dy; - } -}; - -template<> -struct GeluGradFunctor { - GeluGradFunctor float_functor; - __device__ half Compute(half x, half dy, int64_t i) const { - return __float2half(float_functor.Compute(__half2float(x), __half2float(dy), i)); - } -}; - -template -__global__ void FusedBiasAddGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, - const Index inner_size, const T* x, const T* bias, T* y) { - const Index block_size = bias_size * inner_size; - 
CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[(i % block_size) / inner_size]; - y[i] = functor.Compute(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradGpu(FUNCTOR grad_functor, const Index elem_cnt, - const Index bias_size, const Index inner_size, const T* x, - const T* bias, const T* dy, T* dx) { - const Index block_size = bias_size * inner_size; - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[(i % block_size) / inner_size]; - dx[i] = grad_functor.Compute(x_i, dy[i], i); - } -} - -template -__global__ void FusedBiasAddRowGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, - const T* x, const T* bias, T* y) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i % bias_size]; - y[i] = functor.Compute(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradRowGpu(FUNCTOR grad_functor, const Index elem_cnt, - const Index bias_size, const T* x, const T* bias, - const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i % bias_size]; - dx[i] = grad_functor.Compute(x_i, dy[i], i); - } -} - -template -__global__ void FusedBiasAddRowGpuHalf2(FUNCTOR functor, const Index elem_cnt, - const Index bias_size, const half* x, const half* bias, - half* y) { - const Index h2_elem_cnt = elem_cnt / 2; - const Index h2_bias_size = bias_size / 2; - const auto* x_h2 = reinterpret_cast(x); - const auto* bias_h2 = reinterpret_cast(bias); - auto* y_h2 = reinterpret_cast(y); - CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { - half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); - y_h2[i] = functor.ComputeHalf2(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradRowGpuHalf2(FUNCTOR grad_functor, const Index elem_cnt, - const Index bias_size, const half* x, const half* bias, - const half* dy, half* dx) { - const Index h2_elem_cnt = elem_cnt / 2; - const Index h2_bias_size = bias_size / 2; - const auto* x_h2 = reinterpret_cast(x); - const auto* bias_h2 = reinterpret_cast(bias); - const auto* dy_h2 = reinterpret_cast(dy); - auto* dx_h2 = reinterpret_cast(dx); - CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { - half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); - half2 dy_i = dy_h2[i]; - half2 dx_i; - dx_i.data.x = grad_functor.Compute(x_i.data.x, dy_i.data.x, 2 * i); - dx_i.data.y = grad_functor.Compute(x_i.data.y, dy_i.data.y, 2 * i + 1); - dx_h2[i] = dx_i; - } -} - -template -__global__ void FusedBiasAddColGpu(FUNCTOR functor, const Index elem_cnt, const Index inner_size, - const T* x, const T* bias, T* y) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i / inner_size]; - y[i] = functor.Compute(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradColGpu(FUNCTOR grad_functor, const Index elem_cnt, - const Index inner_size, const T* x, const T* bias, - const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i / inner_size]; - dx[i] = grad_functor.Compute(x_i, dy[i], i); - } -} - -template -struct FusedBiasAddRow { - static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, - const T* x, const T* bias, T* y) { - FusedBiasAddRowGpu - <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, y); - } -}; - -template -struct FusedBiasAddRow { - static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, - const half* x, const half* bias, half* y) { - if (bias_size % 2 == 0) { - FusedBiasAddRowGpuHalf2 - <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, 
bias, - y); - } else { - FusedBiasAddRowGpu - <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, - y); - } - } -}; - -template -void FusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, Index outer_size, Index bias_size, - Index inner_size, const T* x, const T* bias, T* y) { - const Index elem_cnt = outer_size * bias_size * inner_size; - if (inner_size == 1) { - FusedBiasAddRow::Invoke(stream, functor, elem_cnt, bias_size, x, bias, y); - } else if (outer_size == 1) { - FusedBiasAddColGpu<<As()->cuda_stream()>>>( - functor, elem_cnt, inner_size, x, bias, y); - } else { - FusedBiasAddGpu<<As()->cuda_stream()>>>( - functor, elem_cnt, bias_size, inner_size, x, bias, y); - } -} - -template -struct FusedBiasAddGradRow { - static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, - const T* x, const T* bias, const T* dy, T* dx) { - FusedBiasAddGradRowGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } -}; - -template -struct FusedBiasAddGradRow { - static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, - const half* x, const half* bias, const half* dy, half* dx) { - if (bias_size % 2 == 0) { - FusedBiasAddGradRowGpuHalf2 - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } else { - FusedBiasAddGradRowGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } - } -}; - -template -void FusedBiasAddGradImpl(ep::Stream* stream, FUNCTOR grad_functor, Index outer_size, - Index bias_size, Index inner_size, const T* x, const T* bias, const T* dy, - T* dx) { - const Index elem_cnt = outer_size * bias_size * inner_size; - if (inner_size == 1) { - FusedBiasAddGradRow::Invoke(stream, grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } else if (outer_size == 1) { - FusedBiasAddGradColGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, inner_size, x, - bias, dy, dx); - } else { - FusedBiasAddGradGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, - inner_size, x, bias, dy, dx); - } -} - -template -void DispatchFusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, int64_t n, - int64_t outer_size, int64_t bias_size, int64_t inner_size, - const T* x, const T* bias, T* y) { - if (IsKernelSafeInt32(n)) { - FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, - x, bias, y); - } else { - FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, - x, bias, y); - } -} - -} // namespace - -template -class FusedFusedBiasAddKernel final : public user_op::OpKernel { - public: - FusedFusedBiasAddKernel() = default; - ~FusedFusedBiasAddKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); - const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t bias_add_axis = ctx->Attr("axis"); - const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); - const auto n = a_tensor->shape_view().elem_cnt(); - GeluFunctor gelu_functor{}; - DispatchFusedBiasAddForwardImpl( - ctx->stream(), gelu_functor, n, outer_size, bias_size, inner_size, a_tensor->dptr(), - b_tensor->dptr(), 
out_tensor->mut_dptr()); - }; - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_bias_add_gelu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(float) -REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(double) -REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(half) - -template -class FusedBiasAddMaskScaleKernel final : public user_op::OpKernel { - public: - FusedBiasAddMaskScaleKernel() = default; - ~FusedBiasAddMaskScaleKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); - const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - const auto* mask_tensor = ctx->Tensor4ArgNameAndIndex("mask", 0); - auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t bias_add_axis = ctx->Attr("axis"); - const float scale = ctx->Attr("scale"); - const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); - const auto n = a_tensor->shape_view().elem_cnt(); - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - MaskAndScaleAddFunctor mask_and_scale_add_functor(mask_tensor->dptr(), - addend->dptr(), scale); - DispatchFusedBiasAddForwardImpl( - ctx->stream(), mask_and_scale_add_functor, n, outer_size, bias_size, inner_size, - a_tensor->dptr(), b_tensor->dptr(), out_tensor->mut_dptr()); - } else { - MaskAndScaleFunctor mask_and_scale_functor(mask_tensor->dptr(), scale); - DispatchFusedBiasAddForwardImpl( - ctx->stream(), mask_and_scale_functor, n, outer_size, bias_size, inner_size, - a_tensor->dptr(), b_tensor->dptr(), out_tensor->mut_dptr()); - } - }; - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_bias_add_mask_scale") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(float) -REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(double) -REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(half) - -template -class FusedFusedBiasAddGradKernel final : public user_op::OpKernel { - public: - FusedFusedBiasAddGradKernel() = default; - ~FusedFusedBiasAddGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); - const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - const auto* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int32_t bias_add_axis = ctx->Attr("axis"); - const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); - const auto n = a_tensor->shape_view().elem_cnt(); - GeluGradFunctor gelu_grad_functor; - if (IsKernelSafeInt32(n)) { - 
FusedBiasAddGradImpl( - ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), - b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); - } else { - FusedBiasAddGradImpl( - ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), - b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); - } - }; - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_bias_add_gelu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(float) -REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(double) -REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(half) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct GeluFunctor { + __device__ T Compute(T x, int64_t i) const { + return static_cast(0.5) * x * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x)); + } +}; + +template<> +struct GeluFunctor { + GeluFunctor float_functor; + __device__ half Compute(half x, int64_t i) const { + return __float2half(float_functor.Compute(__half2float(x), i)); + } + __device__ half2 ComputeHalf2(half2 x, int64_t i) const { + half2 y; + y.data.x = __float2half(float_functor.Compute(__half2float(x.data.x), 2 * i)); + y.data.y = __float2half(float_functor.Compute(__half2float(x.data.y), 2 * i + 1)); + return y; + } +}; + +template +struct MaskAndScaleFunctor { + MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} + __device__ T Compute(T x, int64_t i) const { return x * static_cast(mask[i]) * scale; } + const bool* mask; + float scale; +}; + +template<> +struct MaskAndScaleFunctor { + MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} + __device__ half Compute(half x, int64_t i) const { + return x * static_cast(mask[i] * scale); + } + __device__ half2 ComputeHalf2(half2 x, int64_t i) const { + const char2* mask_c2 = reinterpret_cast(mask); + char2 mask_val = mask_c2[i]; + half2 one_or_zero_h2; + half2 h2_scale = __float2half2_rn(scale); + one_or_zero_h2.data.x = mask_val.x; + one_or_zero_h2.data.y = mask_val.y; + return __hmul2(__hmul2(x, one_or_zero_h2), h2_scale); + } + const bool* mask; + float scale; +}; + +template +struct MaskAndScaleAddFunctor { + MaskAndScaleAddFunctor(const bool* mask, const T* addend, float scale) + : mask(mask), addend(addend), scale(scale) {} + __device__ T Compute(T x, int64_t i) const { + return x * static_cast(mask[i]) * scale + addend[i]; + } + const bool* mask; + const T* addend; + float scale; +}; + +template<> +struct MaskAndScaleAddFunctor { + 
MaskAndScaleAddFunctor(const bool* mask, const half* addend, float scale) + : mask(mask), addend(addend), scale(scale) {} + __device__ half Compute(half x, int64_t i) const { + return x * static_cast(mask[i] * scale) + addend[i]; + } + __device__ half2 ComputeHalf2(half2 x, int64_t i) const { + const char2* mask_c2 = reinterpret_cast(mask); + const half2* addend_h2 = reinterpret_cast(addend); + char2 mask_val = mask_c2[i]; + half2 one_or_zero_h2; + half2 h2_scale = __float2half2_rn(scale); + one_or_zero_h2.data.x = mask_val.x; + one_or_zero_h2.data.y = mask_val.y; + return __hadd2(__hmul2(__hmul2(x, one_or_zero_h2), h2_scale), addend_h2[i]); + } + const bool* mask; + const half* addend; + float scale; +}; + +template +struct GeluGradFunctor { + const T coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); + __device__ T Compute(T x, T dy, int64_t i) const { + return static_cast(0.5) + * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) + + x * coef * exp(static_cast(-0.5) * x * x)) + * dy; + } +}; + +template<> +struct GeluGradFunctor { + GeluGradFunctor float_functor; + __device__ half Compute(half x, half dy, int64_t i) const { + return __float2half(float_functor.Compute(__half2float(x), __half2float(dy), i)); + } +}; + +template +__global__ void FusedBiasAddGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, + const Index inner_size, const T* x, const T* bias, T* y) { + const Index block_size = bias_size * inner_size; + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[(i % block_size) / inner_size]; + y[i] = functor.Compute(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradGpu(FUNCTOR grad_functor, const Index elem_cnt, + const Index bias_size, const Index inner_size, const T* x, + const T* bias, const T* dy, T* dx) { + const Index block_size = bias_size * inner_size; + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[(i % block_size) / inner_size]; + dx[i] = grad_functor.Compute(x_i, dy[i], i); + } +} + +template +__global__ void FusedBiasAddRowGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, + const T* x, const T* bias, T* y) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i % bias_size]; + y[i] = functor.Compute(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradRowGpu(FUNCTOR grad_functor, const Index elem_cnt, + const Index bias_size, const T* x, const T* bias, + const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i % bias_size]; + dx[i] = grad_functor.Compute(x_i, dy[i], i); + } +} + +template +__global__ void FusedBiasAddRowGpuHalf2(FUNCTOR functor, const Index elem_cnt, + const Index bias_size, const half* x, const half* bias, + half* y) { + const Index h2_elem_cnt = elem_cnt / 2; + const Index h2_bias_size = bias_size / 2; + const auto* x_h2 = reinterpret_cast(x); + const auto* bias_h2 = reinterpret_cast(bias); + auto* y_h2 = reinterpret_cast(y); + CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { + half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); + y_h2[i] = functor.ComputeHalf2(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradRowGpuHalf2(FUNCTOR grad_functor, const Index elem_cnt, + const Index bias_size, const half* x, const half* bias, + const half* dy, half* dx) { + const Index h2_elem_cnt = elem_cnt / 2; + const Index h2_bias_size = bias_size / 2; + const auto* x_h2 = reinterpret_cast(x); + const auto* bias_h2 = reinterpret_cast(bias); + const auto* dy_h2 = reinterpret_cast(dy); + auto* dx_h2 = 
reinterpret_cast(dx); + CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { + half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); + half2 dy_i = dy_h2[i]; + half2 dx_i; + dx_i.data.x = grad_functor.Compute(x_i.data.x, dy_i.data.x, 2 * i); + dx_i.data.y = grad_functor.Compute(x_i.data.y, dy_i.data.y, 2 * i + 1); + dx_h2[i] = dx_i; + } +} + +template +__global__ void FusedBiasAddColGpu(FUNCTOR functor, const Index elem_cnt, const Index inner_size, + const T* x, const T* bias, T* y) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i / inner_size]; + y[i] = functor.Compute(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradColGpu(FUNCTOR grad_functor, const Index elem_cnt, + const Index inner_size, const T* x, const T* bias, + const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i / inner_size]; + dx[i] = grad_functor.Compute(x_i, dy[i], i); + } +} + +template +struct FusedBiasAddRow { + static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, + const T* x, const T* bias, T* y) { + FusedBiasAddRowGpu + <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, y); + } +}; + +template +struct FusedBiasAddRow { + static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, + const half* x, const half* bias, half* y) { + if (bias_size % 2 == 0) { + FusedBiasAddRowGpuHalf2 + <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, + y); + } else { + FusedBiasAddRowGpu + <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, + y); + } + } +}; + +template +void FusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, Index outer_size, Index bias_size, + Index inner_size, const T* x, const T* bias, T* y) { + const Index elem_cnt = outer_size * bias_size * inner_size; + if (inner_size == 1) { + FusedBiasAddRow::Invoke(stream, functor, elem_cnt, bias_size, x, bias, y); + } else if (outer_size == 1) { + FusedBiasAddColGpu<<As()->cuda_stream()>>>( + functor, elem_cnt, inner_size, x, bias, y); + } else { + FusedBiasAddGpu<<As()->cuda_stream()>>>( + functor, elem_cnt, bias_size, inner_size, x, bias, y); + } +} + +template +struct FusedBiasAddGradRow { + static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, + const T* x, const T* bias, const T* dy, T* dx) { + FusedBiasAddGradRowGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } +}; + +template +struct FusedBiasAddGradRow { + static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, + const half* x, const half* bias, const half* dy, half* dx) { + if (bias_size % 2 == 0) { + FusedBiasAddGradRowGpuHalf2 + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } else { + FusedBiasAddGradRowGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } + } +}; + +template +void FusedBiasAddGradImpl(ep::Stream* stream, FUNCTOR grad_functor, Index outer_size, + Index bias_size, Index inner_size, const T* x, const T* bias, const T* dy, + T* dx) { + const Index elem_cnt = outer_size * bias_size * inner_size; + if (inner_size == 1) { + FusedBiasAddGradRow::Invoke(stream, grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } else if (outer_size == 1) { + FusedBiasAddGradColGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, inner_size, x, + bias, dy, dx); + } else { + FusedBiasAddGradGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, 
bias_size, + inner_size, x, bias, dy, dx); + } +} + +template +void DispatchFusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, int64_t n, + int64_t outer_size, int64_t bias_size, int64_t inner_size, + const T* x, const T* bias, T* y) { + if (IsKernelSafeInt32(n)) { + FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, + x, bias, y); + } else { + FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, + x, bias, y); + } +} + +} // namespace + +template +class FusedFusedBiasAddKernel final : public user_op::OpKernel { + public: + FusedFusedBiasAddKernel() = default; + ~FusedFusedBiasAddKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); + const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); + auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t bias_add_axis = ctx->Attr("axis"); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); + GeluFunctor gelu_functor{}; + DispatchFusedBiasAddForwardImpl( + ctx->stream(), gelu_functor, n, outer_size, bias_size, inner_size, a_tensor->dptr(), + b_tensor->dptr(), out_tensor->mut_dptr()); + }; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_bias_add_gelu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(float) +REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(double) +REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(half) + +template +class FusedBiasAddMaskScaleKernel final : public user_op::OpKernel { + public: + FusedBiasAddMaskScaleKernel() = default; + ~FusedBiasAddMaskScaleKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); + const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); + const auto* mask_tensor = ctx->Tensor4ArgNameAndIndex("mask", 0); + auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t bias_add_axis = ctx->Attr("axis"); + const float scale = ctx->Attr("scale"); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + MaskAndScaleAddFunctor mask_and_scale_add_functor(mask_tensor->dptr(), + addend->dptr(), scale); + DispatchFusedBiasAddForwardImpl( + ctx->stream(), mask_and_scale_add_functor, n, outer_size, bias_size, inner_size, + a_tensor->dptr(), b_tensor->dptr(), out_tensor->mut_dptr()); + } else { + MaskAndScaleFunctor mask_and_scale_functor(mask_tensor->dptr(), scale); + DispatchFusedBiasAddForwardImpl( + ctx->stream(), mask_and_scale_functor, n, outer_size, bias_size, inner_size, + a_tensor->dptr(), b_tensor->dptr(), 
out_tensor->mut_dptr()); + } + }; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_bias_add_mask_scale") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(float) +REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(double) +REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(half) + +template +class FusedFusedBiasAddGradKernel final : public user_op::OpKernel { + public: + FusedFusedBiasAddGradKernel() = default; + ~FusedFusedBiasAddGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); + const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); + const auto* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const int32_t bias_add_axis = ctx->Attr("axis"); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); + GeluGradFunctor gelu_grad_functor; + if (IsKernelSafeInt32(n)) { + FusedBiasAddGradImpl( + ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), + b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); + } else { + FusedBiasAddGradImpl( + ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), + b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); + } + }; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_bias_add_gelu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(float) +REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(double) +REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(half) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp b/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp index 5e898fd..b61f559 100644 --- a/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp @@ -1,112 +1,112 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
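As a plain host-side reference for the fused_bias_add_gelu forward above (the inner_size == 1 row layout handled by GeluFunctor and FusedBiasAddRowGpu): the bias is broadcast along the last axis and the GELU 0.5 * v * (1 + erf(v / sqrt(2))) is applied to the sum. A minimal sketch with a hypothetical helper name, assuming x.size() is a multiple of bias.size(); not part of the patch.

#include <cmath>
#include <vector>

std::vector<float> FusedBiasAddGeluRef(const std::vector<float>& x,
                                       const std::vector<float>& bias) {
  std::vector<float> y(x.size());
  const size_t bias_size = bias.size();
  for (size_t i = 0; i < x.size(); ++i) {
    const float v = x[i] + bias[i % bias_size];                // bias add over the last axis
    y[i] = 0.5f * v * (1.0f + std::erf(v / std::sqrt(2.0f)));  // GELU, as in GeluFunctor
  }
  return y;
}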
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void FusedCastScaleGpu(const int64_t n, const T scale_val, const U* in, - const T* scale_by_ptr, T* out) { - const T scale = *scale_by_ptr * scale_val; - CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(in[i]) * scale; } -} - -template<> -__global__ void FusedCastScaleGpu(const int64_t n, const float scale_val, - const half* in, const float* scale_by_ptr, - float* out) { - const float scale = *scale_by_ptr * scale_val; - const int64_t n_2 = n / 2; - const auto* in_2 = reinterpret_cast(in); - auto* out_2 = reinterpret_cast(out); - CUDA_1D_KERNEL_LOOP(i, n_2) { - float2 f2 = __half22float2(in_2[i]); - f2.x *= scale; - f2.y *= scale; - out_2[i] = f2; - } - if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { - out[n - 1] = __half2float(in[n - 1]) * scale; - } -} - -template<> -__global__ void FusedCastScaleGpu(const int64_t n, const half scale_val, - const float* in, const half* scale_by_ptr, - half* out) { - const half scale = *scale_by_ptr * scale_val; - const half2 scale_h2 = __half2half2(scale); - const int64_t n_2 = n / 2; - const auto* in_2 = reinterpret_cast(in); - auto* out_h2 = reinterpret_cast(out); - CUDA_1D_KERNEL_LOOP(i, n_2) { - half2 in_h2 = __float22half2_rn(in_2[i]); - out_h2[i] = __hmul2(in_h2, scale_h2); - } - if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { - out[n - 1] = __float2half(in[n - 1]) * scale; - } -} - -template -class FusedCastScaleGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - FusedCastScaleGpuKernel() = default; - ~FusedCastScaleGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t n = x->shape_view().elem_cnt(); - const double scale = ctx->Attr("scale"); - const int64_t launch_n = ((std::is_same::value && std::is_same::value) - || (std::is_same::value && std::is_same::value)) - ? RoundUp(n, 2) / 2 - : n; - FusedCastScaleGpu<<stream()->As()->cuda_stream()>>>( - n, static_cast(scale), x->dptr(), scale_by_tensor->dptr(), y->mut_dptr()); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(x_type, y_type) \ - REGISTER_USER_KERNEL("fused_cast_scale") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, float); -// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, double); -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, half); -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, double); -// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, half); -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, float); -#undef REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void FusedCastScaleGpu(const int64_t n, const T scale_val, const U* in, + const T* scale_by_ptr, T* out) { + const T scale = *scale_by_ptr * scale_val; + CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(in[i]) * scale; } +} + +template<> +__global__ void FusedCastScaleGpu(const int64_t n, const float scale_val, + const half* in, const float* scale_by_ptr, + float* out) { + const float scale = *scale_by_ptr * scale_val; + const int64_t n_2 = n / 2; + const auto* in_2 = reinterpret_cast(in); + auto* out_2 = reinterpret_cast(out); + CUDA_1D_KERNEL_LOOP(i, n_2) { + float2 f2 = __half22float2(in_2[i]); + f2.x *= scale; + f2.y *= scale; + out_2[i] = f2; + } + if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { + out[n - 1] = __half2float(in[n - 1]) * scale; + } +} + +template<> +__global__ void FusedCastScaleGpu(const int64_t n, const half scale_val, + const float* in, const half* scale_by_ptr, + half* out) { + const half scale = *scale_by_ptr * scale_val; + const half2 scale_h2 = __half2half2(scale); + const int64_t n_2 = n / 2; + const auto* in_2 = reinterpret_cast(in); + auto* out_h2 = reinterpret_cast(out); + CUDA_1D_KERNEL_LOOP(i, n_2) { + half2 in_h2 = __float22half2_rn(in_2[i]); + out_h2[i] = __hmul2(in_h2, scale_h2); + } + if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { + out[n - 1] = __float2half(in[n - 1]) * scale; + } +} + +template +class FusedCastScaleGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + FusedCastScaleGpuKernel() = default; + ~FusedCastScaleGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int64_t n = x->shape_view().elem_cnt(); + const double scale = ctx->Attr("scale"); + const int64_t launch_n = ((std::is_same::value && std::is_same::value) + || (std::is_same::value && std::is_same::value)) + ? 
RoundUp(n, 2) / 2 + : n; + FusedCastScaleGpu<<stream()->As()->cuda_stream()>>>( + n, static_cast(scale), x->dptr(), scale_by_tensor->dptr(), y->mut_dptr()); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(x_type, y_type) \ + REGISTER_USER_KERNEL("fused_cast_scale") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)); + +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, float); +// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, double); +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, half); +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, double); +// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, half); +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, float); +#undef REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp b/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp index a2dd5bb..d83dad4 100644 --- a/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp +++ b/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp @@ -1,259 +1,259 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -enum InteractionMode { kVector = 0, kMatrix }; - -constexpr int kBlockSize = 256; - -void InferMatmulMNK(const ShapeView& a_shape, const ShapeView& b_shape, bool transpose_a, - bool transpose_b, size_t* m, size_t* n, size_t* k) { - const int64_t num_a_axes = a_shape.NumAxes(); - CHECK_GE(num_a_axes, 2); - const int64_t num_b_axes = b_shape.NumAxes(); - CHECK_GE(num_b_axes, 2); - if (!transpose_a) { - *m = a_shape.At(num_a_axes - 2); - *k = a_shape.At(num_a_axes - 1); - } else { - *m = a_shape.At(num_a_axes - 1); - *k = a_shape.At(num_a_axes - 2); - } - if (!transpose_b) { - CHECK_EQ(b_shape.At(num_b_axes - 2), *k); - *n = b_shape.At(num_b_axes - 1); - } else { - CHECK_EQ(b_shape.At(num_b_axes - 1), *k); - *n = b_shape.At(num_b_axes - 2); - } -} - -ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { - return transpose ? 
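For reference, the fused_cast_scale kernels above all compute out[i] = cast(in[i]) * (*scale_by_tensor * scale), with the half/float specializations only vectorizing that same arithmetic. A minimal host-side sketch with a hypothetical helper name, not part of the patch:

#include <vector>

template<typename T, typename U>
std::vector<T> FusedCastScaleRef(const std::vector<U>& in, T scale_by_tensor, T scale) {
  const T effective_scale = scale_by_tensor * scale;  // corresponds to *scale_by_ptr * scale_val
  std::vector<T> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) { out[i] = static_cast<T>(in[i]) * effective_scale; }
  return out;
}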
ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; -} - -std::unique_ptr NewMatmulPrimitive(DeviceType device_type, - DataType data_type, bool transpose_a, - bool transpose_b) { - const auto trans_a = GetBlasTransposeType(transpose_a); - const auto trans_b = GetBlasTransposeType(transpose_b); - return ep::primitive::NewPrimitive(device_type, data_type, trans_a, - trans_b); -} - -template -std::unique_ptr NewMatmulPrimitive(Context* ctx) { - const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); - return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, - /*transpose_b=*/true); -} - -auto MatmulPrimitiveExists() { - return hob::make_custom("MatmulPrimitiveExists", [](const user_op::KernelRegContext& ctx) { - return NewMatmulPrimitive(&ctx).operator bool(); - }); -} - -template -__global__ void FusedBiasAddMulAddResidualKernel(const T* in, const T* x, const T* x0, - const T* bias, T* out, const IndexType cols, - const IndexType elem_cnt) { - const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; - using LoadPack = cuda::elementwise::Packed; - for (IndexType linear_index = global_thread_id * pack_size, - step = gridDim.x * blockDim.x * pack_size; - linear_index < elem_cnt; linear_index += step) { - const IndexType row_idx = linear_index / cols; - const IndexType col_idx = linear_index - row_idx * cols; - - const LoadPack* x0_load = reinterpret_cast(x0 + linear_index); - const LoadPack* x_load = reinterpret_cast(x + linear_index); - const LoadPack* bias_load = reinterpret_cast(bias + col_idx); - - LoadPack x0_vec = *x0_load; - LoadPack x_vec = *x_load; - LoadPack bias_vec = *bias_load; - - LoadPack out_store; - if (mode == InteractionMode::kVector) { - T in_val = in[row_idx]; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - out_store.elem[i] = x0_vec.elem[i] * in_val + bias_vec.elem[i] + x_vec.elem[i]; - } - } else if (mode == InteractionMode::kMatrix) { - const LoadPack* in_load = reinterpret_cast(in + linear_index); - LoadPack in_vec = *in_load; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - out_store.elem[i] = (in_vec.elem[i] + bias_vec.elem[i]) * x0_vec.elem[i] + x_vec.elem[i]; - } - } else { - asm volatile("s_trap 0;"); - } - *(reinterpret_cast(out + linear_index)) = out_store; - } -} - -template -int GetLaunchPackSize(const int64_t cols) { - constexpr int type_pack_size = cuda::elementwise::PackSize(); - for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { - if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { - return launch_pack_size; - } - } - return 1; -} - -template -void DispatchFusedBiasAddMulAddResidualPackSize(ep::Stream* stream, const T* in, const T* x, - const T* x0, const T* bias, T* out, - const IndexType cols, const IndexType elem_cnt) { - int grid_size; - const int pack_size = GetLaunchPackSize(cols); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, elem_cnt); - } else if (pack_size == 4) { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, elem_cnt); - } else if (pack_size == 2) { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, elem_cnt); - } else { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, 
elem_cnt); - } -} - -template -void DispatchFusedBiasAddMulAddResidualIndexType(ep::Stream* stream, const T* in, const T* x, - const T* x0, const T* bias, T* out, - const int64_t cols, const int64_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, - elem_cnt); - } else { - DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, - elem_cnt); - } -} - -template -class FusedCrossFeatureInteractionKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedCrossFeatureInteractionKernel() = default; - ~FusedCrossFeatureInteractionKernel() override = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - /* - Cross Interaction v1: - 1. x matmul weight. matmul_result0 -> (B, E) matmul (1, E) -> (B, 1) - dx = dmatmul_result0 matmul weight - dw = x matmul dmatmul_result0 - - 2. matmul_result0 broadcast_mul x0. matmul_result1 -> (B, 1) broadcast_mul (B, E) -> (B, E) - dmatmul_result0 = reduce_sum(dmatmul_result1 * x0, axis=1) - dx0 = dmatmul_result1 broadcast_mul matmul_result0 - - 3. matmul_result1 broadcast_add bias. matmul_result2 -> (B, E) broadcast_add (1, E) -> (B, E) - dmatmul_result1 = dout - dbias = reduce_sum(dmatmul_result2, axis=0) - - 4. matmul_result2 add x. out -> (B, E) elementwise_add (B, E) -> (B, E) - dmatmul_result2 = dout, dx = dout. - - Cross Interaction Grad: - dw = x matmul dmatmul_result0 - dx0 = dmatmul_result1 broadcast_mul matmul_result0 - dbias = reduce_sum(dmatmul_result2, axis=0) - dx = (dmatmul_result0 matmul weight) + dout. - - Cross Interaction v2: - 1. x matmul weight. matmul_result0 -> (B, E) matmul (E, E) -> (B, E) - - 2. matmul_result0 add bias. matmul_result1 -> (B, E) bias_add (1, E) -> (B, E) - - 3. matmul_result1 multiply x0. matmul_result2 -> (B, E) elementwise_mul (B, E) -> (B, E) - - 4. matmul_result2 add x. 
out -> (B, E) elementwise_add (B, E) -> (B, E) - - */ - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); - const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - const std::string interaction_mode = ctx->Attr("interaction_mode"); - - CHECK_EQ(out->shape_view().NumAxes(), 2); - size_t m = 0, n = 0, k = 0; - InferMatmulMNK(x->shape_view(), weight->shape_view(), /*trans_a=*/false, /*trans_b=*/true, &m, - &n, &k); - const double alpha = 1.0; - double beta = 0.0; - auto matmul = NewMatmulPrimitive(ctx); - CHECK(matmul); - matmul->Launch(ctx->stream(), m, n, k, alpha, x->dptr(), weight->dptr(), beta, - matmul_result->mut_dptr()); - const int64_t elem_cnt = out->shape_view().elem_cnt(); - const int64_t cols = out->shape_view().At(1); - if (interaction_mode == "vector") { - DispatchFusedBiasAddMulAddResidualIndexType( - ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), - out->mut_dptr(), cols, elem_cnt); - } else { - DispatchFusedBiasAddMulAddResidualIndexType( - ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), - out->mut_dptr(), cols, elem_cnt); - } - } -}; - -#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_cross_feature_interaction") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value) \ - && MatmulPrimitiveExists()); - -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(float) -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(half) - -} // namespace - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
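The comment block above describes the "vector" interaction as out = x0 * m + bias + x, where m is the (B, 1) matmul result broadcast across each row; this is exactly the kVector branch of FusedBiasAddMulAddResidualKernel. A minimal host-side sketch of that epilogue, assuming row-major (B, E) tensors and a hypothetical function name; not part of the patch.

#include <vector>

void CrossInteractionV1Epilogue(const std::vector<float>& m,     // (B): per-row matmul result
                                const std::vector<float>& x0,    // (B, E), row-major
                                const std::vector<float>& bias,  // (E)
                                const std::vector<float>& x,     // (B, E), row-major
                                std::vector<float>* out) {       // (B, E), row-major
  const size_t cols = bias.size();
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const size_t row = i / cols;
    const size_t col = i - row * cols;
    (*out)[i] = x0[i] * m[row] + bias[col] + x[i];  // kVector branch of the fused kernel
  }
}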
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +enum InteractionMode { kVector = 0, kMatrix }; + +constexpr int kBlockSize = 256; + +void InferMatmulMNK(const ShapeView& a_shape, const ShapeView& b_shape, bool transpose_a, + bool transpose_b, size_t* m, size_t* n, size_t* k) { + const int64_t num_a_axes = a_shape.NumAxes(); + CHECK_GE(num_a_axes, 2); + const int64_t num_b_axes = b_shape.NumAxes(); + CHECK_GE(num_b_axes, 2); + if (!transpose_a) { + *m = a_shape.At(num_a_axes - 2); + *k = a_shape.At(num_a_axes - 1); + } else { + *m = a_shape.At(num_a_axes - 1); + *k = a_shape.At(num_a_axes - 2); + } + if (!transpose_b) { + CHECK_EQ(b_shape.At(num_b_axes - 2), *k); + *n = b_shape.At(num_b_axes - 1); + } else { + CHECK_EQ(b_shape.At(num_b_axes - 1), *k); + *n = b_shape.At(num_b_axes - 2); + } +} + +ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { + return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; +} + +std::unique_ptr NewMatmulPrimitive(DeviceType device_type, + DataType data_type, bool transpose_a, + bool transpose_b) { + const auto trans_a = GetBlasTransposeType(transpose_a); + const auto trans_b = GetBlasTransposeType(transpose_b); + return ep::primitive::NewPrimitive(device_type, data_type, trans_a, + trans_b); +} + +template +std::unique_ptr NewMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, + /*transpose_b=*/true); +} + +auto MatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const user_op::KernelRegContext& ctx) { + return NewMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +__global__ void FusedBiasAddMulAddResidualKernel(const T* in, const T* x, const T* x0, + const T* bias, T* out, const IndexType cols, + const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const IndexType col_idx = linear_index - row_idx * cols; + + const LoadPack* x0_load = reinterpret_cast(x0 + linear_index); + const LoadPack* x_load = reinterpret_cast(x + linear_index); + const LoadPack* bias_load = reinterpret_cast(bias + col_idx); + + LoadPack x0_vec = *x0_load; + LoadPack x_vec = *x_load; + LoadPack bias_vec = *bias_load; + + LoadPack out_store; + if (mode == InteractionMode::kVector) { + T in_val = in[row_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = x0_vec.elem[i] * in_val + bias_vec.elem[i] + x_vec.elem[i]; + } + } else if (mode == InteractionMode::kMatrix) { + const LoadPack* in_load = reinterpret_cast(in + linear_index); + LoadPack in_vec = *in_load; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = (in_vec.elem[i] + bias_vec.elem[i]) * x0_vec.elem[i] + x_vec.elem[i]; + } + } else { + asm volatile("s_trap 0;"); + } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +int GetLaunchPackSize(const 
int64_t cols) { + constexpr int type_pack_size = cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +void DispatchFusedBiasAddMulAddResidualPackSize(ep::Stream* stream, const T* in, const T* x, + const T* x0, const T* bias, T* out, + const IndexType cols, const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else if (pack_size == 4) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else if (pack_size == 2) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } +} + +template +void DispatchFusedBiasAddMulAddResidualIndexType(ep::Stream* stream, const T* in, const T* x, + const T* x0, const T* bias, T* out, + const int64_t cols, const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, + elem_cnt); + } else { + DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, + elem_cnt); + } +} + +template +class FusedCrossFeatureInteractionKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedCrossFeatureInteractionKernel() = default; + ~FusedCrossFeatureInteractionKernel() override = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + /* + Cross Interaction v1: + 1. x matmul weight. matmul_result0 -> (B, E) matmul (1, E) -> (B, 1) + dx = dmatmul_result0 matmul weight + dw = x matmul dmatmul_result0 + + 2. matmul_result0 broadcast_mul x0. matmul_result1 -> (B, 1) broadcast_mul (B, E) -> (B, E) + dmatmul_result0 = reduce_sum(dmatmul_result1 * x0, axis=1) + dx0 = dmatmul_result1 broadcast_mul matmul_result0 + + 3. matmul_result1 broadcast_add bias. matmul_result2 -> (B, E) broadcast_add (1, E) -> (B, E) + dmatmul_result1 = dout + dbias = reduce_sum(dmatmul_result2, axis=0) + + 4. matmul_result2 add x. out -> (B, E) elementwise_add (B, E) -> (B, E) + dmatmul_result2 = dout, dx = dout. + + Cross Interaction Grad: + dw = x matmul dmatmul_result0 + dx0 = dmatmul_result1 broadcast_mul matmul_result0 + dbias = reduce_sum(dmatmul_result2, axis=0) + dx = (dmatmul_result0 matmul weight) + dout. + + Cross Interaction v2: + 1. x matmul weight. matmul_result0 -> (B, E) matmul (E, E) -> (B, E) + + 2. matmul_result0 add bias. matmul_result1 -> (B, E) bias_add (1, E) -> (B, E) + + 3. matmul_result1 multiply x0. matmul_result2 -> (B, E) elementwise_mul (B, E) -> (B, E) + + 4. matmul_result2 add x. 
out -> (B, E) elementwise_add (B, E) -> (B, E) + + */ + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const user_op::Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + const std::string interaction_mode = ctx->Attr("interaction_mode"); + + CHECK_EQ(out->shape_view().NumAxes(), 2); + size_t m = 0, n = 0, k = 0; + InferMatmulMNK(x->shape_view(), weight->shape_view(), /*trans_a=*/false, /*trans_b=*/true, &m, + &n, &k); + const double alpha = 1.0; + double beta = 0.0; + auto matmul = NewMatmulPrimitive(ctx); + CHECK(matmul); + matmul->Launch(ctx->stream(), m, n, k, alpha, x->dptr(), weight->dptr(), beta, + matmul_result->mut_dptr()); + const int64_t elem_cnt = out->shape_view().elem_cnt(); + const int64_t cols = out->shape_view().At(1); + if (interaction_mode == "vector") { + DispatchFusedBiasAddMulAddResidualIndexType( + ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), + out->mut_dptr(), cols, elem_cnt); + } else { + DispatchFusedBiasAddMulAddResidualIndexType( + ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), + out->mut_dptr(), cols, elem_cnt); + } + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && MatmulPrimitiveExists()); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(half) + +} // namespace + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp b/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp index 6e5483a..38e37b0 100644 --- a/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp +++ b/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp @@ -1,455 +1,455 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = 256; - -void InferMatmulMNK(const DimVector& a_shape, const DimVector& b_shape, bool transpose_a, - bool transpose_b, size_t* m, size_t* n, size_t* k) { - const int64_t num_a_axes = a_shape.size(); - CHECK_GE(num_a_axes, 2); - const int64_t num_b_axes = b_shape.size(); - CHECK_GE(num_b_axes, 2); - if (!transpose_a) { - *m = a_shape.at(num_a_axes - 2); - *k = a_shape.at(num_a_axes - 1); - } else { - *m = a_shape.at(num_a_axes - 1); - *k = a_shape.at(num_a_axes - 2); - } - if (!transpose_b) { - CHECK_EQ(b_shape.at(num_b_axes - 2), *k); - *n = b_shape.at(num_b_axes - 1); - } else { - CHECK_EQ(b_shape.at(num_b_axes - 1), *k); - *n = b_shape.at(num_b_axes - 2); - } -} - -ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { - return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; -} - -template -struct MulOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a * b; } -}; - -template -struct AddOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } -}; - -template -int GetLaunchPackSize(const int64_t cols) { - constexpr int type_pack_size = cuda::elementwise::PackSize(); - for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { - if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { - return launch_pack_size; - } - } - return 1; -} - -template -__global__ void BroadcastMulKernel(const T* x, const T* y, T* out, const IndexType cols, - const IndexType elem_cnt) { - const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; - using LoadPack = cuda::elementwise::Packed; - for (IndexType linear_index = global_thread_id * pack_size, - step = gridDim.x * blockDim.x * pack_size; - linear_index < elem_cnt; linear_index += step) { - const IndexType row_idx = linear_index / cols; - const LoadPack* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec = *x_load; - LoadPack out_store; - const T y_val = y[row_idx]; -#pragma unroll - for (int i = 0; i < pack_size; i++) { out_store.elem[i] = x_vec.elem[i] * y_val; } - *(reinterpret_cast(out + linear_index)) = out_store; - } -} - -template -void DispatchBroadcastMulPackSize(ep::Stream* stream, const T* x, const T* y, T* out, - const IndexType cols, const IndexType elem_cnt) { - int grid_size; - const int pack_size = GetLaunchPackSize(cols); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } else if (pack_size == 4) { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } else if (pack_size == 2) { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } else { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } -} - -template -void DispatchBroadcastMulIndexType(ep::Stream* stream, const T* x, const T* y, T* out, - const int64_t cols, const int64_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); - } else { - 
DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); - } -} - -template -__global__ void BroadcastAddElementwiseMulKernel(const T* x, const T* y, const T* z, T* out, - const IndexType cols, const IndexType elem_cnt) { - const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; - using LoadPack = cuda::elementwise::Packed; - for (IndexType linear_index = global_thread_id * pack_size, - step = gridDim.x * blockDim.x * pack_size; - linear_index < elem_cnt; linear_index += step) { - const IndexType row_idx = linear_index / cols; - const IndexType col_idx = linear_index - row_idx * cols; - const LoadPack* x_load = reinterpret_cast(x + linear_index); - const LoadPack* y_load = reinterpret_cast(y + col_idx); - const LoadPack* z_load = reinterpret_cast(z + linear_index); - - LoadPack x_vec = *x_load; - LoadPack y_vec = *y_load; - LoadPack z_vec = *z_load; - LoadPack out_store; - -#pragma unroll - for (int i = 0; i < pack_size; i++) { - out_store.elem[i] = (x_vec.elem[i] + y_vec.elem[i]) * z_vec.elem[i]; - } - *(reinterpret_cast(out + linear_index)) = out_store; - } -} - -template -void DispatchBroadcastAddElementwiseMulPackSize(ep::Stream* stream, const T* x, const T* y, - const T* z, T* out, const IndexType cols, - const IndexType elem_cnt) { - int grid_size; - const int pack_size = GetLaunchPackSize(cols); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } else if (pack_size == 4) { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } else if (pack_size == 2) { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } else { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } -} - -template -void DispatchBroadcastAddElementwiseMulIndexType(ep::Stream* stream, const T* x, const T* y, - const T* z, T* out, const int64_t cols, - const int64_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); - } else { - DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); - } -} - -} // namespace - -namespace user_op { - -std::unique_ptr NewMatmulPrimitive(DeviceType device_type, - DataType data_type, bool transpose_a, - bool transpose_b) { - const auto trans_a = GetBlasTransposeType(transpose_a); - const auto trans_b = GetBlasTransposeType(transpose_b); - return ep::primitive::NewPrimitive(device_type, data_type, trans_a, - trans_b); -} - -template -std::unique_ptr NewReduceMatmulPrimitive(Context* ctx) { - const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->data_type(); - return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, - /*transpose_b=*/false); -} - -auto ReduceMatmulPrimitiveExists() { - return hob::make_custom("MatmulPrimitiveExists", [](const KernelRegContext& ctx) { - return NewReduceMatmulPrimitive(&ctx).operator bool(); - }); -} - -template -std::unique_ptr NewWeightGradMatmulPrimitive(Context* ctx) { - const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); - return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/true, - /*transpose_b=*/false); -} - -auto WeightGradMatmulPrimitiveExists() { - return hob::make_custom("MatmulPrimitiveExists", [](const 
KernelRegContext& ctx) { - return NewWeightGradMatmulPrimitive(&ctx).operator bool(); - }); -} - -template -class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public CudaGraphSupport { - public: - FusedCrossFeatureInteractionGradKernel() = default; - ~FusedCrossFeatureInteractionGradKernel() override = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { - const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); - const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - - const int64_t batch_size = dy->shape_view().At(0); - const int64_t hidden_size = dy->shape_view().At(1); - const int64_t out_size = weight->shape_view().At(0); - const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); - - Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); - Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); - Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); - Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - // step1: Get dbias. - const T* ones = nullptr; - auto* cuda_device = dynamic_cast(ctx->stream()->device()); - if (cuda_device != nullptr) { - ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); - } - size_t m = 0, n = 0, k = 0; - DimVector dy_shape(2); - dy->shape_view().ToDimVector(&dy_shape); - DimVector ones_buf_shape(2); - ones_buf_shape.at(0) = 1; - ones_buf_shape.at(1) = batch_size; - InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); - auto reduce_matmul = NewReduceMatmulPrimitive(ctx); - CHECK(reduce_matmul); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, dy->dptr(), 0.0, dbias->mut_dptr()); - - // step2: Get dmatmul_result0. 
- T* dy_mul_x0 = reinterpret_cast(tmp_buffer->mut_dptr()); - T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); - OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dy_mul_x0, dy->dptr(), - x0->dptr(), - ctx->stream()->As()->cuda_stream())); - - ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), hidden_size)); - DimVector dy_mul_x0_shape(2); - dy->shape_view().ToDimVector(&dy_mul_x0_shape); - ones_buf_shape.at(0) = hidden_size; - ones_buf_shape.at(1) = 1; - InferMatmulMNK(dy_mul_x0_shape, ones_buf_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, - &k); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dy_mul_x0, ones, 0.0, dmatmul_result0); - - // step3: Get dx - T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(dy_elem_cnt * sizeof(T)) - + GetCudaAlignedSize(batch_size * sizeof(T))); - DimVector dmatmul_result_shape(2); - dmatmul_result_shape.at(0) = batch_size; - dmatmul_result_shape.at(1) = 1; // todo change to hidden size - DimVector weight_shape(2); - weight->shape_view().ToDimVector(&weight_shape); - InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, - &k); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, - reinterpret_cast(dx_buf)); - OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, - dy->dptr(), - ctx->stream()->As()->cuda_stream())); - - // step4: Get dw. - DimVector x_shape(2); - x->shape_view().ToDimVector(&x_shape); - - InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); - auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); - CHECK(weight_grad_matmul); - weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, - dw->mut_dptr()); - - // step5: Get dx0. 
- DispatchBroadcastMulIndexType(ctx->stream(), dy->dptr(), matmul_result->dptr(), - dx0->mut_dptr(), hidden_size, dy_elem_cnt); - } -}; - -#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_cross_feature_interaction_v1_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ - && (HobDataType("dy", 0) == GetDataType::value) \ - && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ - .SetInferTmpSizeFn([](InferContext* ctx) { \ - size_t tmp_size = 0; \ - const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ - const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ - const int64_t batch_size = dy.shape().At(0); \ - size_t dy_mul_x0_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ - size_t dmatmul_result_size = GetCudaAlignedSize(batch_size * sizeof(dtype)); \ - size_t dx_buf_size = dy_mul_x0_size; \ - tmp_size = dy_mul_x0_size + dmatmul_result_size + dx_buf_size; \ - return tmp_size; \ - }); - -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(float) -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(half) - -template -class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public CudaGraphSupport { - public: - FusedCrossFeatureInteractionV2GradKernel() = default; - ~FusedCrossFeatureInteractionV2GradKernel() = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { - const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); - const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - - const int64_t batch_size = dy->shape_view().At(0); - const int64_t in_size = weight->shape_view().At(1); - const int64_t hidden_size = weight->shape_view().At(0); - const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); - - Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); - Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); - Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); - Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - // step1: Get dx0. - DispatchBroadcastAddElementwiseMulIndexType(ctx->stream(), matmul_result->dptr(), - bias->dptr(), dy->dptr(), - dx0->mut_dptr(), hidden_size, dy_elem_cnt); - - // step2: Get dmatmul_result0. 
- T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr()); - OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dmatmul_result0, dy->dptr(), - x0->dptr(), - ctx->stream()->As()->cuda_stream())); - // step3: Get dx - T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); - DimVector dmatmul_result_shape(2); - dmatmul_result_shape.at(0) = batch_size; - dmatmul_result_shape.at(1) = hidden_size; - DimVector weight_shape(2); - weight->shape_view().ToDimVector(&weight_shape); - size_t m = 0, n = 0, k = 0; - InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, - &k); - auto reduce_matmul = NewReduceMatmulPrimitive(ctx); - CHECK(reduce_matmul); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, - reinterpret_cast(dx_buf)); - OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, - dy->dptr(), - ctx->stream()->As()->cuda_stream())); - - // step4: Get dw. - DimVector x_shape(2); - x->shape_view().ToDimVector(&x_shape); - - InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); - auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); - CHECK(weight_grad_matmul); - weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, - dw->mut_dptr()); - - // step5: Get dbias. - const T* ones = nullptr; - auto* cuda_device = dynamic_cast(ctx->stream()->device()); - if (cuda_device != nullptr) { - ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); - } - DimVector dy_shape(2); - dy->shape_view().ToDimVector(&dy_shape); - DimVector ones_buf_shape(2); - ones_buf_shape.at(0) = 1; - ones_buf_shape.at(1) = batch_size; - InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, - reinterpret_cast(dmatmul_result0), 0.0, dbias->mut_dptr()); - } -}; - -#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_cross_feature_interaction_v2_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ - && (HobDataType("dy", 0) == GetDataType::value) \ - && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ - .SetInferTmpSizeFn([](InferContext* ctx) { \ - size_t tmp_size = 0; \ - const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ - const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ - size_t dmatmul_result_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ - size_t dx_buf_size = dmatmul_result_size; \ - tmp_size = dmatmul_result_size + dx_buf_size; \ - return tmp_size; \ - }); - -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(float) -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(half) - -} // namespace user_op - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = 256; + +void InferMatmulMNK(const DimVector& a_shape, const DimVector& b_shape, bool transpose_a, + bool transpose_b, size_t* m, size_t* n, size_t* k) { + const int64_t num_a_axes = a_shape.size(); + CHECK_GE(num_a_axes, 2); + const int64_t num_b_axes = b_shape.size(); + CHECK_GE(num_b_axes, 2); + if (!transpose_a) { + *m = a_shape.at(num_a_axes - 2); + *k = a_shape.at(num_a_axes - 1); + } else { + *m = a_shape.at(num_a_axes - 1); + *k = a_shape.at(num_a_axes - 2); + } + if (!transpose_b) { + CHECK_EQ(b_shape.at(num_b_axes - 2), *k); + *n = b_shape.at(num_b_axes - 1); + } else { + CHECK_EQ(b_shape.at(num_b_axes - 1), *k); + *n = b_shape.at(num_b_axes - 2); + } +} + +ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { + return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; +} + +template +struct MulOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a * b; } +}; + +template +struct AddOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +int GetLaunchPackSize(const int64_t cols) { + constexpr int type_pack_size = cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +__global__ void BroadcastMulKernel(const T* x, const T* y, T* out, const IndexType cols, + const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const LoadPack* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec = *x_load; + LoadPack out_store; + const T y_val = y[row_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { out_store.elem[i] = x_vec.elem[i] * y_val; } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +void DispatchBroadcastMulPackSize(ep::Stream* stream, const T* x, const T* y, T* out, + const IndexType cols, const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else if (pack_size == 4) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else if (pack_size == 2) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } +} + +template +void DispatchBroadcastMulIndexType(ep::Stream* stream, const T* x, const T* y, T* out, + const int64_t cols, const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); + } else { + 
DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); + } +} + +template +__global__ void BroadcastAddElementwiseMulKernel(const T* x, const T* y, const T* z, T* out, + const IndexType cols, const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const IndexType col_idx = linear_index - row_idx * cols; + const LoadPack* x_load = reinterpret_cast(x + linear_index); + const LoadPack* y_load = reinterpret_cast(y + col_idx); + const LoadPack* z_load = reinterpret_cast(z + linear_index); + + LoadPack x_vec = *x_load; + LoadPack y_vec = *y_load; + LoadPack z_vec = *z_load; + LoadPack out_store; + +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = (x_vec.elem[i] + y_vec.elem[i]) * z_vec.elem[i]; + } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +void DispatchBroadcastAddElementwiseMulPackSize(ep::Stream* stream, const T* x, const T* y, + const T* z, T* out, const IndexType cols, + const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else if (pack_size == 4) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else if (pack_size == 2) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } +} + +template +void DispatchBroadcastAddElementwiseMulIndexType(ep::Stream* stream, const T* x, const T* y, + const T* z, T* out, const int64_t cols, + const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); + } else { + DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); + } +} + +} // namespace + +namespace user_op { + +std::unique_ptr NewMatmulPrimitive(DeviceType device_type, + DataType data_type, bool transpose_a, + bool transpose_b) { + const auto trans_a = GetBlasTransposeType(transpose_a); + const auto trans_b = GetBlasTransposeType(transpose_b); + return ep::primitive::NewPrimitive(device_type, data_type, trans_a, + trans_b); +} + +template +std::unique_ptr NewReduceMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, + /*transpose_b=*/false); +} + +auto ReduceMatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const KernelRegContext& ctx) { + return NewReduceMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +std::unique_ptr NewWeightGradMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/true, + /*transpose_b=*/false); +} + +auto WeightGradMatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const 
KernelRegContext& ctx) { + return NewWeightGradMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public CudaGraphSupport { + public: + FusedCrossFeatureInteractionGradKernel() = default; + ~FusedCrossFeatureInteractionGradKernel() override = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { + const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + + const int64_t batch_size = dy->shape_view().At(0); + const int64_t hidden_size = dy->shape_view().At(1); + const int64_t out_size = weight->shape_view().At(0); + const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); + + Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); + Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); + Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); + Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + // step1: Get dbias. + const T* ones = nullptr; + auto* cuda_device = dynamic_cast(ctx->stream()->device()); + if (cuda_device != nullptr) { + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); + } + size_t m = 0, n = 0, k = 0; + DimVector dy_shape(2); + dy->shape_view().ToDimVector(&dy_shape); + DimVector ones_buf_shape(2); + ones_buf_shape.at(0) = 1; + ones_buf_shape.at(1) = batch_size; + InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); + auto reduce_matmul = NewReduceMatmulPrimitive(ctx); + CHECK(reduce_matmul); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, dy->dptr(), 0.0, dbias->mut_dptr()); + + // step2: Get dmatmul_result0. 
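// Sketch of what step 2 computes (shapes as documented in the forward kernel:
// B = batch_size, H = hidden_size). In v1 the forward is
//   matmul_result0 = x matmul weight              // (B, 1)
//   out            = x0 * matmul_result0 + bias + x
// so d(matmul_result0) is the row-wise reduction of dy * x0 (elementwise).
// The code below materializes dy * x0 in tmp_buffer, then reduces over H by
// multiplying with a ones vector: (B, H) matmul (H, 1) -> (B, 1), i.e.
//   dmatmul_result0[b] = sum_h dy[b, h] * x0[b, h].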
+ T* dy_mul_x0 = reinterpret_cast(tmp_buffer->mut_dptr()); + T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); + OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dy_mul_x0, dy->dptr(), + x0->dptr(), + ctx->stream()->As()->cuda_stream())); + + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), hidden_size)); + DimVector dy_mul_x0_shape(2); + dy->shape_view().ToDimVector(&dy_mul_x0_shape); + ones_buf_shape.at(0) = hidden_size; + ones_buf_shape.at(1) = 1; + InferMatmulMNK(dy_mul_x0_shape, ones_buf_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dy_mul_x0, ones, 0.0, dmatmul_result0); + + // step3: Get dx + T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T)) + + GetCudaAlignedSize(batch_size * sizeof(T))); + DimVector dmatmul_result_shape(2); + dmatmul_result_shape.at(0) = batch_size; + dmatmul_result_shape.at(1) = 1; // todo change to hidden size + DimVector weight_shape(2); + weight->shape_view().ToDimVector(&weight_shape); + InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, + reinterpret_cast(dx_buf)); + OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, + dy->dptr(), + ctx->stream()->As()->cuda_stream())); + + // step4: Get dw. + DimVector x_shape(2); + x->shape_view().ToDimVector(&x_shape); + + InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); + auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); + CHECK(weight_grad_matmul); + weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, + dw->mut_dptr()); + + // step5: Get dx0. 
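// Sketch of step 5: in the v1 forward, x0 only enters through the term
// x0 * broadcast(matmul_result0), where matmul_result0 holds one scalar per
// row. Hence dx0[b, h] = dy[b, h] * matmul_result0[b], which is exactly what
// BroadcastMulKernel does below: each row of dy is scaled by the per-row
// value loaded from matmul_result.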
+ DispatchBroadcastMulIndexType(ctx->stream(), dy->dptr(), matmul_result->dptr(), + dx0->mut_dptr(), hidden_size, dy_elem_cnt); + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction_v1_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ + && (HobDataType("dy", 0) == GetDataType::value) \ + && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ + .SetInferTmpSizeFn([](InferContext* ctx) { \ + size_t tmp_size = 0; \ + const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ + const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ + const int64_t batch_size = dy.shape().At(0); \ + size_t dy_mul_x0_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ + size_t dmatmul_result_size = GetCudaAlignedSize(batch_size * sizeof(dtype)); \ + size_t dx_buf_size = dy_mul_x0_size; \ + tmp_size = dy_mul_x0_size + dmatmul_result_size + dx_buf_size; \ + return tmp_size; \ + }); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(half) + +template +class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public CudaGraphSupport { + public: + FusedCrossFeatureInteractionV2GradKernel() = default; + ~FusedCrossFeatureInteractionV2GradKernel() = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { + const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + + const int64_t batch_size = dy->shape_view().At(0); + const int64_t in_size = weight->shape_view().At(1); + const int64_t hidden_size = weight->shape_view().At(0); + const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); + + Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); + Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); + Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); + Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + // step1: Get dx0. + DispatchBroadcastAddElementwiseMulIndexType(ctx->stream(), matmul_result->dptr(), + bias->dptr(), dy->dptr(), + dx0->mut_dptr(), hidden_size, dy_elem_cnt); + + // step2: Get dmatmul_result0. 
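// Sketch of step 2 for the v2 variant, whose forward (per the docstring in
// the forward kernel) is
//   out = (x matmul weight + bias) * x0 + x      // elementwise * over (B, E)
// The gradient w.r.t. the pre-scale term (x matmul weight + bias) is therefore
// dy * x0 (elementwise), which the Binary(MulOp) call below writes into
// tmp_buffer; the same buffer then feeds dx (matmul with weight), dw
// (transposed matmul with x) and dbias (column reduction via a ones vector).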
+ T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr()); + OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dmatmul_result0, dy->dptr(), + x0->dptr(), + ctx->stream()->As()->cuda_stream())); + // step3: Get dx + T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); + DimVector dmatmul_result_shape(2); + dmatmul_result_shape.at(0) = batch_size; + dmatmul_result_shape.at(1) = hidden_size; + DimVector weight_shape(2); + weight->shape_view().ToDimVector(&weight_shape); + size_t m = 0, n = 0, k = 0; + InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + auto reduce_matmul = NewReduceMatmulPrimitive(ctx); + CHECK(reduce_matmul); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, + reinterpret_cast(dx_buf)); + OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, + dy->dptr(), + ctx->stream()->As()->cuda_stream())); + + // step4: Get dw. + DimVector x_shape(2); + x->shape_view().ToDimVector(&x_shape); + + InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); + auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); + CHECK(weight_grad_matmul); + weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, + dw->mut_dptr()); + + // step5: Get dbias. + const T* ones = nullptr; + auto* cuda_device = dynamic_cast(ctx->stream()->device()); + if (cuda_device != nullptr) { + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); + } + DimVector dy_shape(2); + dy->shape_view().ToDimVector(&dy_shape); + DimVector ones_buf_shape(2); + ones_buf_shape.at(0) = 1; + ones_buf_shape.at(1) = batch_size; + InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, + reinterpret_cast(dmatmul_result0), 0.0, dbias->mut_dptr()); + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction_v2_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ + && (HobDataType("dy", 0) == GetDataType::value) \ + && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ + .SetInferTmpSizeFn([](InferContext* ctx) { \ + size_t tmp_size = 0; \ + const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ + const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ + size_t dmatmul_result_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ + size_t dx_buf_size = dmatmul_result_size; \ + tmp_size = dmatmul_result_size + dx_buf_size; \ + return tmp_size; \ + }); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(half) + +} // namespace user_op + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp index 07993ac..bf662d2 100644 --- a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp @@ -1,923 +1,923 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/ep/include/primitive/batch_matmul.h" -#include "oneflow/core/kernel/cuda_graph_support.h" - -namespace oneflow { - -namespace { - -__global__ void GenerateGatherIndicesGpu(const int32_t elem_cnt, const int32_t stride, - const int32_t in_cols, const int32_t offset, - int32_t* gather_indices) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row = i / stride; - const int32_t col = i - row * stride; - if (col < row + offset) { - int32_t in_index = row * in_cols + col; - int32_t idx = row * (offset + row - 1 + offset) / 2 + col; - gather_indices[idx] = in_index; - } - } -} - -template -__global__ void GatherConcatGpu(int32_t elem_cnt, int32_t out_cols, int32_t valid_out_cols, - int32_t in_cols, int32_t output_concat_end_dim, - const int32_t* gather_indices, const T* in, - const T* output_concat_ptr, T* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row = i / out_cols; - const int32_t col = i - row * out_cols; - T out_val; - if (col < output_concat_end_dim) { - const int32_t output_concat_idx = row * output_concat_end_dim + col; - out_val = output_concat_ptr[output_concat_idx]; - } else if (col < valid_out_cols) { - const int32_t gather_col_idx = gather_indices[col - output_concat_end_dim]; - const int32_t in_offset = row * in_cols + gather_col_idx; - out_val = in[in_offset]; - } else { - out_val = 0; - } - out_ptr[i] = out_val; - } -} - -template -__global__ void ScatterSplitAddTransposeGpu(int32_t elem_cnt, int32_t stride_dim, int32_t out_dim, - int32_t in_grad_stride, int32_t in_grad_matrix_dim, - int32_t in_grad_matrix_valid_dim, - int32_t output_concat_end_dim, const int32_t offset, - const T* dy, T* output_concat_grad, T* in_grad) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row = i / stride_dim; - const int32_t col = i - row * stride_dim; - if (col < output_concat_end_dim) { - output_concat_grad[row * output_concat_end_dim + col] = dy[row * out_dim + col]; - } else { - int32_t in_col_id = col - output_concat_end_dim; - const int32_t matrix_row = in_col_id / in_grad_matrix_dim; - const int32_t matrix_col = in_col_id - matrix_row * in_grad_matrix_dim; - T grad_val = 0; - const T* row_dy = dy + row * out_dim + output_concat_end_dim; - if (matrix_row < in_grad_matrix_valid_dim && matrix_col < in_grad_matrix_valid_dim) { - if (matrix_col < matrix_row) { - int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; - grad_val = row_dy[dy_col_idx]; - } else if (matrix_row < matrix_col) { - // transpose add - int32_t trans_row_id = matrix_col; - int32_t trans_col_id = matrix_row; - int32_t dy_col_idx = - trans_row_id * (offset + trans_row_id - 1 + offset) / 2 + trans_col_id; - grad_val = row_dy[dy_col_idx]; - } else if ((matrix_row == matrix_col) && (offset == 1)) { - int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; - grad_val = row_dy[dy_col_idx] * static_cast(2); - } - } - int32_t in_grad_offset = row 
* in_grad_stride + in_col_id; - in_grad[in_grad_offset] = grad_val; - } - } -} - -template -void ConcatFeatures(user_op::KernelComputeContext* ctx, int64_t dst_rows, int64_t dst_cols, - void* dst_ptr) { - const int64_t feature_input_size = ctx->input_size("features"); - auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); - DimVector dst_shape = {dst_rows, dst_cols}; - int64_t out_col_offset = 0; - for (int64_t i = 0; i < feature_input_size; ++i) { - const user_op::Tensor* feature = ctx->Tensor4ArgNameAndIndex("features", i); - const int64_t feature_rows = feature->shape_view().At(0); - const int64_t feature_cols = feature->shape_view().Count(1); - DimVector dst_pos_vec = {0, out_col_offset}; - DimVector src_shape = {feature_rows, feature_cols}; - DimVector src_pos_vec = {0, 0}; - DimVector extent_vec = {feature_rows, feature_cols}; - primitive->Launch(ctx->stream(), feature->data_type(), 2, dst_ptr, dst_shape.data(), - dst_pos_vec.data(), feature->dptr(), src_shape.data(), src_pos_vec.data(), - extent_vec.data()); - out_col_offset += feature_cols; - } - int64_t pad_dim = dst_cols - out_col_offset; - if (pad_dim > 0) { - char* out_ptr = reinterpret_cast(dst_ptr) + out_col_offset * sizeof(T); - OF_CUDA_CHECK(hipMemset2DAsync(out_ptr, dst_cols * sizeof(T), 0, pad_dim * sizeof(T), dst_rows, - ctx->stream()->As()->cuda_stream())); - } -} - -template -void GatherConcatKernel(ep::Stream* stream, int32_t elem_cnt, int32_t out_dim, - int32_t valid_out_dim, int32_t features_concated_dim, - int32_t concated_padded_dim, int32_t output_concat_end_dim, - bool self_interaction, const T* matmul_out, const T* output_concat_ptr, - int32_t* gather_indices_ptr, T* out_ptr) { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - const int32_t gen_indices_elem_cnt = features_concated_dim * features_concated_dim; - int32_t offset = self_interaction ? 1 : 0; - hipLaunchKernelGGL(GenerateGatherIndicesGpu, BlocksNum4ThreadsNum(gen_indices_elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, gen_indices_elem_cnt, features_concated_dim, - concated_padded_dim, offset, gather_indices_ptr); - - int32_t matmul_stride = concated_padded_dim * concated_padded_dim; - hipLaunchKernelGGL(GatherConcatGpu, BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, - elem_cnt, out_dim, valid_out_dim, matmul_stride, output_concat_end_dim, gather_indices_ptr, - matmul_out, output_concat_ptr, out_ptr); -} - -template -void ScatterSplitAddTranspose(ep::Stream* stream, int32_t batch_size, int32_t out_dim, - int32_t concated_padded_dim, int32_t features_concated_dim, - int32_t output_concat_end_dim, const bool self_interaction, - const T* dy, T* output_concat_grad, T* matmul_out_grad_ptr) { - int32_t stride_dim = output_concat_end_dim + concated_padded_dim * concated_padded_dim; - int32_t matmul_stride = concated_padded_dim * concated_padded_dim; - const int32_t elem_cnt = batch_size * stride_dim; - int32_t offset = self_interaction ? 
1 : 0; - ScatterSplitAddTransposeGpu<<As()->cuda_stream()>>>( - elem_cnt, stride_dim, out_dim, matmul_stride, concated_padded_dim, features_concated_dim, - output_concat_end_dim, offset, dy, output_concat_grad, matmul_out_grad_ptr); -} - -template -void ConcatFeaturesGrad(user_op::KernelComputeContext* ctx, const int64_t batch_size, - const int64_t concated_padded_dim, const int64_t vector_size, - const T* concated_features_grad) { - auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); - DimVector src_shape = {batch_size, concated_padded_dim * vector_size}; - int64_t in_col_offset = 0; - for (int64_t i = 0; i < ctx->output_size("features_grad"); ++i) { - user_op::Tensor* feature_grad = ctx->Tensor4ArgNameAndIndex("features_grad", i); - const int64_t feature_grad_rows = feature_grad->shape_view().At(0); - const int64_t feature_grad_cols = feature_grad->shape_view().Count(1); - DimVector dst_shape = {feature_grad_rows, feature_grad_cols}; - DimVector dst_pos_vec = {0, 0}; - DimVector src_pos_vec = {0, in_col_offset}; - DimVector extent_vec = {feature_grad_rows, feature_grad_cols}; - in_col_offset += feature_grad_cols; - primitive->Launch(ctx->stream(), feature_grad->data_type(), 2, feature_grad->mut_dptr(), - dst_shape.data(), dst_pos_vec.data(), concated_features_grad, - src_shape.data(), src_pos_vec.data(), extent_vec.data()); - } -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -template -struct alignas(sizeof(T) * pack_size) Pack { - T elem[pack_size]; -}; - -int64_t GetPaddedDim(int64_t dim) { - const int64_t align_dim = 16; - const int64_t padded_dim = (dim + align_dim - 1) / align_dim * align_dim; - return padded_dim; -} - -template -struct DotFwdParam { - const T* in[max_in]; - int32_t in_feature_dim[max_in]; - int32_t dim_start_offset[max_in]; - int32_t features_dim; - const T* output_concat; - int32_t output_concat_size; - T* out; - int32_t num_in; -}; - -constexpr int kUnrollDim = 2; -template -__global__ void DotFeatureInteractionWmmaImpl( - int m_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, int vector_num_pack, - int padded_vector_num_pack, int out_num_cols, int out_num_cols_num_pack, int in_shared_mem_cols, - int in_shared_mem_cols_num_pack, int acc_shared_mem_cols, int acc_shared_mem_cols_num_pack, - int offset, int output_padding, DotFwdParam param) { - asm volatile("s_trap 0;"); -} - -template -struct KTileDim { - static const int val = 16; -}; - -template<> -struct KTileDim { - static const int val = 8; -}; - -template -struct DotFeatureInteractionKernel { - static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, - int vector_size, int out_num_cols, bool self_interaction, int output_padding, - const DotFwdParam& param) { - const int block_size = 128; - const int block_dim_x = 32; - const int block_dim_y = block_size / block_dim_x; - const int num_blocks = batch_size; - const int mn_tile_dim = 16; - const int k_tile_dim = KTileDim::val; - const int64_t padded_vector_size = GetPaddedDim(vector_size); - const int m_num_tiles = concated_padded_dim / mn_tile_dim; - const int k_num_tiles = padded_vector_size / k_tile_dim; - const int skew_in = 8; - const int skew_acc = 8; - const int in_shared_mem_num_cols = padded_vector_size + skew_in; - const int acc_shared_mem_num_cols = concated_padded_dim + skew_acc; - const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); - using ComputeType = 
typename DefaultComputeType::type; - const size_t acc_shared_mem_bytes = - concated_padded_dim * acc_shared_mem_num_cols * sizeof(ComputeType); - const size_t total_shared_mem_bytes = in_shared_mem_bytes + acc_shared_mem_bytes; - const int32_t offset = self_interaction ? 1 : 0; - const int out_num_cols_num_pack = out_num_cols / pack_size; - const int vector_num_pack = vector_size / pack_size; - const int padded_vector_num_pack = padded_vector_size / pack_size; - const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; - const int acc_shared_mem_cols_num_pack = acc_shared_mem_num_cols / pack_size; - int max_active_blocks; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, - DotFeatureInteractionWmmaImpl, - block_size, total_shared_mem_bytes)); - if (max_active_blocks <= 0) { return false; } - hipStream_t cuda_stream = stream->As()->cuda_stream(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(DotFeatureInteractionWmmaImpl), num_blocks, dim3(block_dim_x, block_dim_y), total_shared_mem_bytes, cuda_stream, - m_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, - padded_vector_num_pack, out_num_cols, out_num_cols_num_pack, in_shared_mem_num_cols, - in_shared_mem_cols_num_pack, acc_shared_mem_num_cols, acc_shared_mem_cols_num_pack, - offset, output_padding, param); - return true; - } -}; - -template -struct DotBwdParam { - const T* out_grad; - const T* in[max_in]; - T* in_grad[max_in]; - T* output_concat_grad; - int32_t output_concat_size; - int32_t in_feature_dim[max_in]; - int32_t dim_start_offset[max_in]; - int32_t features_dim; - int32_t num_in; -}; - -template -__global__ void DotFeatureInteractionBackwardWmmaImpl( - int m_num_tiles, int n_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, - int vector_num_pack, int padded_vector_num_pack, int out_num_cols, int in_shared_mem_cols, - int in_shared_mem_cols_num_pack, int matrix_out_grad_shared_mem_cols, int offset, - DotBwdParam param) { - asm volatile("s_trap 0;"); -} - -template -struct DotFeatureInteractionBackwardKernel { - static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, - int vector_size, int out_num_cols, bool self_interaction, - const DotBwdParam& param) { - const int block_size = 256; - const int block_dim_x = 32; - const int block_dim_y = block_size / block_dim_x; - const int num_blocks = batch_size; - const int mn_tile_dim = 16; - const int k_tile_dim = KTileDim::val; - const int64_t padded_vector_size = GetPaddedDim(vector_size); - const int m_num_tiles = concated_padded_dim / mn_tile_dim; - const int k_num_tiles = concated_padded_dim / k_tile_dim; - const int n_num_tiles = padded_vector_size / mn_tile_dim; - const int skew_in = 8; - const int in_shared_mem_num_cols = padded_vector_size + skew_in; - const int matrix_out_grad_shared_mem_cols = concated_padded_dim + skew_in; - const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); - const size_t matrix_out_grad_shared_mem_bytes = - concated_padded_dim * matrix_out_grad_shared_mem_cols * sizeof(T); - using ComputeType = typename DefaultComputeType::type; - const size_t in_grad_shared_mem_bytes = - concated_padded_dim * in_shared_mem_num_cols * sizeof(ComputeType); - const size_t total_shared_mem_bytes = - in_shared_mem_bytes + matrix_out_grad_shared_mem_bytes + in_grad_shared_mem_bytes; - const int32_t offset = self_interaction ? 
1 : 0; - const int vector_num_pack = vector_size / pack_size; - const int padded_vector_num_pack = padded_vector_size / pack_size; - const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; - int max_active_blocks; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, - DotFeatureInteractionBackwardWmmaImpl, - block_size, total_shared_mem_bytes)); - if (max_active_blocks <= 0) { return false; } - hipStream_t cuda_stream = stream->As()->cuda_stream(); - DotFeatureInteractionBackwardWmmaImpl - <<>>( - m_num_tiles, n_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, - padded_vector_num_pack, out_num_cols, in_shared_mem_num_cols, - in_shared_mem_cols_num_pack, matrix_out_grad_shared_mem_cols, offset, param); - - return true; - } -}; - -template -bool DispatchFeatureInteractionDotPackSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t batch_size = out->shape_view().At(0); - const int64_t out_num_cols = out->shape_view().At(1); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); - DotFwdParam param; - param.num_in = input_size; - param.out = out->mut_dptr(); - int64_t features_concated_dim = 0; - for (int i = 0; i < input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - param.dim_start_offset[i] = features_concated_dim; - features_concated_dim += param.in_feature_dim[i]; - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - param.features_dim = features_concated_dim; - if (ctx->has_input("output_concat", 0)) { - const user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); - param.output_concat = output_concat->dptr(); - param.output_concat_size = output_concat->shape_view().At(1); - } else { - param.output_concat = nullptr; - param.output_concat_size = 0; - } - const bool self_interaction = ctx->Attr("self_interaction"); - const int32_t output_padding = ctx->Attr("output_padding"); - if (vector_size % 4 == 0 && out_num_cols % 4 == 0) { - return DotFeatureInteractionKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - output_padding, param); - } else if (vector_size % 2 == 0 && out_num_cols % 2 == 0) { - return DotFeatureInteractionKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - output_padding, param); - } else { - return DotFeatureInteractionKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - output_padding, param); - } -} - -template -bool DispatchFeatureInteractionDotBackwardPackSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t batch_size = dy->shape_view().At(0); - const int64_t out_num_cols = dy->shape_view().At(1); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); - DotBwdParam param; - param.num_in = input_size; - param.out_grad = dy->dptr(); - int64_t features_concated_dim = 0; - for (int i = 0; i < input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - 
param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - param.dim_start_offset[i] = features_concated_dim; - features_concated_dim += param.in_feature_dim[i]; - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - param.features_dim = features_concated_dim; - if (ctx->has_output("output_concat_grad", 0)) { - user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); - param.output_concat_grad = output_concat_grad->mut_dptr(); - param.output_concat_size = output_concat_grad->shape_view().At(1); - } else { - param.output_concat_grad = nullptr; - param.output_concat_size = 0; - } - const bool self_interaction = ctx->Attr("self_interaction"); - if (vector_size % 4 == 0) { - return DotFeatureInteractionBackwardKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - param); - } else if (vector_size % 2 == 0) { - return DotFeatureInteractionBackwardKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - param); - } else { - return DotFeatureInteractionBackwardKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - param); - } -} - -template -struct Param { - const T* in[max_in]; - int32_t in_feature_dim[max_in]; - T* out; - int32_t num_in; -}; - -template -__global__ void FeatureInteractionSum(int64_t batch_size, int64_t vector_num_pack, - Param param) { - using ComputeType = typename DefaultComputeType::type; - Pack* dst_pack = reinterpret_cast*>(param.out); - for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; - batch_idx += gridDim.x * blockDim.y) { - Pack* batch_out = dst_pack + batch_idx * vector_num_pack; - for (int col_id = threadIdx.x; col_id < vector_num_pack; col_id += blockDim.x) { - Pack sum; - Pack square_sum; -#pragma unroll - for (int k = 0; k < pack_size; ++k) { - sum.elem[k] = static_cast(0); - square_sum.elem[k] = static_cast(0); - } - for (int i = 0; i < max_in; ++i) { - if (i >= param.num_in) { break; } - const Pack* batch_in = - reinterpret_cast*>(param.in[i]) - + batch_idx * param.in_feature_dim[i] * vector_num_pack; -#pragma unroll - for (int j = 0; j < param.in_feature_dim[i]; ++j) { - Pack val = batch_in[j * vector_num_pack + col_id]; -#pragma unroll - for (int k = 0; k < pack_size; ++k) { - const ComputeType compute_val = static_cast(val.elem[k]); - sum.elem[k] += compute_val; - square_sum.elem[k] += compute_val * compute_val; - } - } - } - Pack out; -#pragma unroll - for (int k = 0; k < pack_size; ++k) { - out.elem[k] = static_cast((sum.elem[k] * sum.elem[k] - square_sum.elem[k]) - * static_cast(0.5)); - } - batch_out[col_id] = out; - } - } -} - -template -struct GradParam { - const T* out_grad; - const T* in[max_in]; - int32_t in_feature_dim[max_in]; - T* in_grad[max_in]; - int32_t num_in; -}; - -template -__global__ void FeatureInteractionSumGrad(int64_t batch_size, int64_t vector_size, - GradParam param) { - using ComputeType = typename DefaultComputeType::type; - for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; - batch_idx += gridDim.x * blockDim.y) { - const T* batch_out_grad = param.out_grad + batch_idx * vector_size; - for (int col_id = threadIdx.x; col_id < vector_size; col_id += blockDim.x) { - ComputeType sum = 0; - for (int i = 0; i < max_in; ++i) { - 
if (i >= param.num_in) { break; } - const T* batch_in = param.in[i] + batch_idx * param.in_feature_dim[i] * vector_size; - for (int j = 0; j < param.in_feature_dim[i]; ++j) { - sum += static_cast(batch_in[j * vector_size + col_id]); - } - } - for (int i = 0; i < max_in; ++i) { - if (i >= param.num_in) { break; } - const int64_t in_batch_offset = batch_idx * param.in_feature_dim[i] * vector_size; - const T* batch_in = param.in[i] + in_batch_offset; - T* batch_in_grad = param.in_grad[i] + in_batch_offset; - for (int j = 0; j < param.in_feature_dim[i]; ++j) { - const int64_t offset = j * vector_size + col_id; - batch_in_grad[offset] = - static_cast(static_cast(batch_out_grad[col_id]) - * (sum - static_cast(batch_in[offset]))); - } - } - } - } -} - -void GetBlockDims(const int64_t vector_size, int* block_dim_x, int* block_dim_y) { - const int block_size = 256; - if (vector_size < block_size) { - *block_dim_x = std::ceil(static_cast(vector_size) / 8) * 8; - *block_dim_y = (block_size + *block_dim_x - 1) / *block_dim_x; - } else { - *block_dim_x = block_size; - *block_dim_y = 1; - } -} - -int GetNumBlocks(const int64_t num_instances, const int64_t instance_per_block) { - int max_blocks = (num_instances + instance_per_block - 1) / instance_per_block; - return std::min(max_blocks, kCudaMaxBlocksNum); -} - -template -void DispatchFeatureInteractionSumPackSize(ep::Stream* stream, const int64_t batch_size, - const int64_t vector_size, - const Param& param) { - int block_dim_x; - int block_dim_y; - const int pack_size = (vector_size % 2 == 0) ? 2 : 1; - const int64_t vector_num_pack = vector_size / pack_size; - GetBlockDims(vector_num_pack, &block_dim_x, &block_dim_y); - const int num_blocks = GetNumBlocks(batch_size, block_dim_y); - dim3 block_dims = dim3(block_dim_x, block_dim_y); - hipStream_t cuda_stream = stream->As()->cuda_stream(); - if (pack_size == 2) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); - } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); - } -} - -template -void DispatchFeatureInteractionSumInputSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t batch_size = out->shape_view().At(0); - const int64_t vector_size = out->shape_view().At(1); - Param param; - param.num_in = input_size; - param.out = out->mut_dptr(); - for (int i = 0; i < input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - } - DispatchFeatureInteractionSumPackSize(ctx->stream(), batch_size, vector_size, param); -} - -template -void DispatchFeatureInteractionSumGradInputSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t batch_size = dy->shape_view().At(0); - const int64_t vector_size = dy->shape_view().At(1); - int block_dim_x; - int block_dim_y; - GetBlockDims(vector_size, &block_dim_x, &block_dim_y); - const int num_blocks = GetNumBlocks(batch_size, block_dim_y); - dim3 block_dims = dim3(block_dim_x, block_dim_y); - GradParam param; - param.num_in = input_size; - param.out_grad = dy->dptr(); - for (int i = 0; i < 
input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); - } - FeatureInteractionSumGrad - <<stream()->As()->cuda_stream()>>>( - batch_size, vector_size, param); -} - -} // namespace - -template -class FusedDotFeatureInteractionPoolingSumKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionPoolingSumKernel() = default; - ~FusedDotFeatureInteractionPoolingSumKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } else if (input_size == 2) { - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } else if (input_size <= 8) { - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "sum")); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(half) - -template -bool TryLaunchTensorCoreDotKernel(user_op::KernelComputeContext* ctx) { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } else if (input_size == 2) { - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } else if (input_size <= 8) { - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } -} - -template -bool TryLaunchTensorCoreDotBackwardKernel(user_op::KernelComputeContext* ctx) { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } else if (input_size == 2) { - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } else if (input_size <= 8) { - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } -} -template -class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionKernel() = default; - ~FusedDotFeatureInteractionKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const DataType data_type = out->data_type(); - CHECK_LT(out->shape_view().elem_cnt(), GetMaxVal()); - auto* cuda_stream = ctx->stream()->As(); - // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) - // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { - // bool success = TryLaunchTensorCoreDotKernel(ctx); - // if (success == true) { return; } - // } - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t batch_size = out->shape_view().At(0); - int64_t features_concated_dim = 0; - for (int64_t i = 0; i < ctx->input_size("features"); ++i) { - features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); - const int64_t out_dim = out->shape_view().At(1); - const int32_t output_padding = ctx->Attr("output_padding"); - const int64_t valid_out_dim = out_dim - output_padding; - const bool self_interaction = ctx->Attr("self_interaction"); - - T* matmul_out = reinterpret_cast(tmp_buffer->mut_dptr()); - size_t matmul_out_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - const int64_t interaction_dim = self_interaction - ? 
features_concated_dim * (features_concated_dim + 1) / 2 - : features_concated_dim * (features_concated_dim - 1) / 2; - int32_t* gather_indices_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size); - size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); - T* padded_concated_features_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size + gather_indices_size); - size_t padded_concated_features_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - matmul_out_size + gather_indices_size + padded_concated_features_size); - ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, - padded_concated_features_ptr); - auto batch_matmul = ep::primitive::NewPrimitive( - ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, - ep::primitive::BlasTransposeType::T); - batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, concated_padded_dim, - vector_size, 1.0, padded_concated_features_ptr, - padded_concated_features_ptr, 0.0, matmul_out); - - int64_t output_concat_end_dim = 0; - const T* output_concat_ptr = nullptr; - if (ctx->has_input("output_concat", 0)) { - user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); - output_concat_end_dim = output_concat->shape_view().At(1); - output_concat_ptr = output_concat->dptr(); - } - CHECK_EQ(valid_out_dim, output_concat_end_dim + interaction_dim); - GatherConcatKernel(ctx->stream(), out->shape_view().elem_cnt(), out_dim, valid_out_dim, - features_concated_dim, concated_padded_dim, output_concat_end_dim, - self_interaction, matmul_out, output_concat_ptr, gather_indices_ptr, - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -user_op::InferTmpSizeFn GenFusedDotFeatureInteractionInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - const Shape& first_feature_shape = ctx->InputShape("features", 0); - const int64_t batch_size = first_feature_shape.At(0); - const int64_t vector_size = first_feature_shape.At(2); - int64_t features_concated_dim = 0; - for (int32_t i = 0; i < ctx->input_size("features"); ++i) { - features_concated_dim += ctx->InputShape("features", i).At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - size_t matmul_out_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - const bool self_interaction = ctx->Attr("self_interaction"); - const int64_t interaction_dim = self_interaction - ? 
features_concated_dim * (features_concated_dim + 1) / 2 - : features_concated_dim * (features_concated_dim - 1) / 2; - size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); - size_t padded_concated_features_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - return matmul_out_size + gather_indices_size + padded_concated_features_size; - }; -} - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "none")) \ - .SetInferTmpSizeFn(GenFusedDotFeatureInteractionInferTmpSizeFn()); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(half) - -template -class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionGradKernel() = default; - ~FusedDotFeatureInteractionGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const DataType data_type = dy->data_type(); - auto* cuda_stream = ctx->stream()->As(); - // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) - // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { - // bool success = TryLaunchTensorCoreDotBackwardKernel(ctx); - // if (success == true) { return; } - // } - const int64_t batch_size = dy->shape_view().At(0); - int64_t features_concated_dim = 0; - for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { - features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features_grad", 0)->shape().At(2); - const int64_t out_dim = dy->shape_view().At(1); - const bool self_interaction = ctx->Attr("self_interaction"); - T* matmul_out_grad_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); - size_t matmul_out_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - T* padded_concated_features_grad_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_grad_size); - size_t padded_concated_features_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - T* padded_concated_features_ptr = reinterpret_cast( - tmp_buffer->mut_dptr() + matmul_out_grad_size + padded_concated_features_grad_size); - size_t padded_concated_features_size = padded_concated_features_grad_size; - CHECK_LE( - matmul_out_grad_size + padded_concated_features_grad_size + padded_concated_features_size, - tmp_buffer->shape_view().elem_cnt()); - ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, - padded_concated_features_ptr); - - T* output_concat_grad_ptr = nullptr; - int64_t output_concat_end_dim = 0; - if (ctx->has_output("output_concat_grad", 0)) { - user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); - output_concat_grad_ptr = output_concat_grad->mut_dptr(); - output_concat_end_dim = 
output_concat_grad->shape_view().At(1); - } - ScatterSplitAddTranspose(ctx->stream(), batch_size, out_dim, concated_padded_dim, - features_concated_dim, output_concat_end_dim, self_interaction, - dy->dptr(), output_concat_grad_ptr, matmul_out_grad_ptr); - - auto batch_matmul = ep::primitive::NewPrimitive( - ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, - ep::primitive::BlasTransposeType::N); - batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, vector_size, - concated_padded_dim, 1.0, matmul_out_grad_ptr, - padded_concated_features_ptr, 0.0, padded_concated_features_grad_ptr); - - ConcatFeaturesGrad(ctx, batch_size, concated_padded_dim, vector_size, - padded_concated_features_grad_ptr); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -user_op::InferTmpSizeFn GenFusedDotFeatureInteractionGradInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - int64_t features_concated_dim = 0; - for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { - features_concated_dim += ctx->InputShape("features_grad", i).At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - const int64_t batch_size = ctx->InputShape("features_grad", 0).At(0); - const int64_t vector_size = ctx->InputShape("features_grad", 0).At(2); - size_t matmul_out_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - size_t padded_concated_features_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - size_t padded_concated_features_size = padded_concated_features_grad_size; - return matmul_out_grad_size + padded_concated_features_grad_size - + padded_concated_features_size; - }; -} - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "none")) \ - .SetInferTmpSizeFn(GenFusedDotFeatureInteractionGradInferTmpSizeFn()); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(half) - -template -class FusedDotFeatureInteractionPoolingSumGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionPoolingSumGradKernel() = default; - ~FusedDotFeatureInteractionPoolingSumGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } else if (input_size == 2) { - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } else if (input_size <= 8) { - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "sum")); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(half) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/ep/include/primitive/batch_matmul.h" +#include "oneflow/core/kernel/cuda_graph_support.h" + +namespace oneflow { + +namespace { + +__global__ void GenerateGatherIndicesGpu(const int32_t elem_cnt, const int32_t stride, + const int32_t in_cols, const int32_t offset, + int32_t* gather_indices) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row = i / stride; + const int32_t col = i - row * stride; + if (col < row + offset) { + int32_t in_index = row * in_cols + col; + int32_t idx = row * (offset + row - 1 + offset) / 2 + col; + gather_indices[idx] = in_index; + } + } +} + +template +__global__ void GatherConcatGpu(int32_t elem_cnt, int32_t out_cols, int32_t valid_out_cols, + int32_t in_cols, int32_t output_concat_end_dim, + const int32_t* gather_indices, const T* in, + const T* output_concat_ptr, T* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row = i / out_cols; + const int32_t col = i - row * out_cols; + T out_val; + if (col < output_concat_end_dim) { + const int32_t output_concat_idx = row * output_concat_end_dim + col; + out_val = output_concat_ptr[output_concat_idx]; + } else if (col < valid_out_cols) { + const int32_t gather_col_idx = gather_indices[col - output_concat_end_dim]; + const int32_t in_offset = row * in_cols + gather_col_idx; + out_val = in[in_offset]; + } else { + out_val = 0; + } + out_ptr[i] = out_val; + } +} + +template +__global__ void ScatterSplitAddTransposeGpu(int32_t elem_cnt, int32_t stride_dim, int32_t out_dim, + int32_t in_grad_stride, int32_t in_grad_matrix_dim, + int32_t in_grad_matrix_valid_dim, + int32_t output_concat_end_dim, const int32_t offset, + const T* dy, T* output_concat_grad, T* in_grad) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row = i / stride_dim; + const int32_t col = i - row * stride_dim; + if (col < output_concat_end_dim) { + output_concat_grad[row * output_concat_end_dim + col] = dy[row * out_dim + col]; + } else { + int32_t in_col_id = col - output_concat_end_dim; + const int32_t matrix_row = in_col_id / in_grad_matrix_dim; + const int32_t matrix_col = in_col_id - matrix_row * 
in_grad_matrix_dim; + T grad_val = 0; + const T* row_dy = dy + row * out_dim + output_concat_end_dim; + if (matrix_row < in_grad_matrix_valid_dim && matrix_col < in_grad_matrix_valid_dim) { + if (matrix_col < matrix_row) { + int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; + grad_val = row_dy[dy_col_idx]; + } else if (matrix_row < matrix_col) { + // transpose add + int32_t trans_row_id = matrix_col; + int32_t trans_col_id = matrix_row; + int32_t dy_col_idx = + trans_row_id * (offset + trans_row_id - 1 + offset) / 2 + trans_col_id; + grad_val = row_dy[dy_col_idx]; + } else if ((matrix_row == matrix_col) && (offset == 1)) { + int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; + grad_val = row_dy[dy_col_idx] * static_cast(2); + } + } + int32_t in_grad_offset = row * in_grad_stride + in_col_id; + in_grad[in_grad_offset] = grad_val; + } + } +} + +template +void ConcatFeatures(user_op::KernelComputeContext* ctx, int64_t dst_rows, int64_t dst_cols, + void* dst_ptr) { + const int64_t feature_input_size = ctx->input_size("features"); + auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); + DimVector dst_shape = {dst_rows, dst_cols}; + int64_t out_col_offset = 0; + for (int64_t i = 0; i < feature_input_size; ++i) { + const user_op::Tensor* feature = ctx->Tensor4ArgNameAndIndex("features", i); + const int64_t feature_rows = feature->shape_view().At(0); + const int64_t feature_cols = feature->shape_view().Count(1); + DimVector dst_pos_vec = {0, out_col_offset}; + DimVector src_shape = {feature_rows, feature_cols}; + DimVector src_pos_vec = {0, 0}; + DimVector extent_vec = {feature_rows, feature_cols}; + primitive->Launch(ctx->stream(), feature->data_type(), 2, dst_ptr, dst_shape.data(), + dst_pos_vec.data(), feature->dptr(), src_shape.data(), src_pos_vec.data(), + extent_vec.data()); + out_col_offset += feature_cols; + } + int64_t pad_dim = dst_cols - out_col_offset; + if (pad_dim > 0) { + char* out_ptr = reinterpret_cast(dst_ptr) + out_col_offset * sizeof(T); + OF_CUDA_CHECK(hipMemset2DAsync(out_ptr, dst_cols * sizeof(T), 0, pad_dim * sizeof(T), dst_rows, + ctx->stream()->As()->cuda_stream())); + } +} + +template +void GatherConcatKernel(ep::Stream* stream, int32_t elem_cnt, int32_t out_dim, + int32_t valid_out_dim, int32_t features_concated_dim, + int32_t concated_padded_dim, int32_t output_concat_end_dim, + bool self_interaction, const T* matmul_out, const T* output_concat_ptr, + int32_t* gather_indices_ptr, T* out_ptr) { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + const int32_t gen_indices_elem_cnt = features_concated_dim * features_concated_dim; + int32_t offset = self_interaction ? 
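+  // offset == 1 keeps the diagonal of the interaction matrix (self interaction), offset == 0
+  // keeps only the strict lower triangle. GenerateGatherIndicesGpu stores the flat source index
+  // of entry (row, col), col < row + offset, at packed slot row * (row - 1 + 2 * offset) / 2 + col;
+  // e.g. with offset == 0, row 3 fills packed slots 3, 4 and 5. A minimal host-side sketch of the
+  // same packing, for illustration only (the helper name and out-parameter are assumptions, not
+  // used anywhere in this file):
+  //   int32_t ReferencePackedIndices(int32_t rows, int32_t in_cols, bool self, int32_t* packed) {
+  //     const int32_t diag = self ? 1 : 0;
+  //     int32_t n = 0;
+  //     for (int32_t row = 0; row < rows; ++row) {
+  //       for (int32_t col = 0; col < row + diag; ++col) { packed[n++] = row * in_cols + col; }
+  //     }
+  //     return n;  // equals the interaction_dim used when sizing the gather_indices buffer
+  //   }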
1 : 0; + hipLaunchKernelGGL(GenerateGatherIndicesGpu, BlocksNum4ThreadsNum(gen_indices_elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, gen_indices_elem_cnt, features_concated_dim, + concated_padded_dim, offset, gather_indices_ptr); + + int32_t matmul_stride = concated_padded_dim * concated_padded_dim; + hipLaunchKernelGGL(GatherConcatGpu, BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, + elem_cnt, out_dim, valid_out_dim, matmul_stride, output_concat_end_dim, gather_indices_ptr, + matmul_out, output_concat_ptr, out_ptr); +} + +template +void ScatterSplitAddTranspose(ep::Stream* stream, int32_t batch_size, int32_t out_dim, + int32_t concated_padded_dim, int32_t features_concated_dim, + int32_t output_concat_end_dim, const bool self_interaction, + const T* dy, T* output_concat_grad, T* matmul_out_grad_ptr) { + int32_t stride_dim = output_concat_end_dim + concated_padded_dim * concated_padded_dim; + int32_t matmul_stride = concated_padded_dim * concated_padded_dim; + const int32_t elem_cnt = batch_size * stride_dim; + int32_t offset = self_interaction ? 1 : 0; + ScatterSplitAddTransposeGpu<<As()->cuda_stream()>>>( + elem_cnt, stride_dim, out_dim, matmul_stride, concated_padded_dim, features_concated_dim, + output_concat_end_dim, offset, dy, output_concat_grad, matmul_out_grad_ptr); +} + +template +void ConcatFeaturesGrad(user_op::KernelComputeContext* ctx, const int64_t batch_size, + const int64_t concated_padded_dim, const int64_t vector_size, + const T* concated_features_grad) { + auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); + DimVector src_shape = {batch_size, concated_padded_dim * vector_size}; + int64_t in_col_offset = 0; + for (int64_t i = 0; i < ctx->output_size("features_grad"); ++i) { + user_op::Tensor* feature_grad = ctx->Tensor4ArgNameAndIndex("features_grad", i); + const int64_t feature_grad_rows = feature_grad->shape_view().At(0); + const int64_t feature_grad_cols = feature_grad->shape_view().Count(1); + DimVector dst_shape = {feature_grad_rows, feature_grad_cols}; + DimVector dst_pos_vec = {0, 0}; + DimVector src_pos_vec = {0, in_col_offset}; + DimVector extent_vec = {feature_grad_rows, feature_grad_cols}; + in_col_offset += feature_grad_cols; + primitive->Launch(ctx->stream(), feature_grad->data_type(), 2, feature_grad->mut_dptr(), + dst_shape.data(), dst_pos_vec.data(), concated_features_grad, + src_shape.data(), src_pos_vec.data(), extent_vec.data()); + } +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +template +struct alignas(sizeof(T) * pack_size) Pack { + T elem[pack_size]; +}; + +int64_t GetPaddedDim(int64_t dim) { + const int64_t align_dim = 16; + const int64_t padded_dim = (dim + align_dim - 1) / align_dim * align_dim; + return padded_dim; +} + +template +struct DotFwdParam { + const T* in[max_in]; + int32_t in_feature_dim[max_in]; + int32_t dim_start_offset[max_in]; + int32_t features_dim; + const T* output_concat; + int32_t output_concat_size; + T* out; + int32_t num_in; +}; + +constexpr int kUnrollDim = 2; +template +__global__ void DotFeatureInteractionWmmaImpl( + int m_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, int vector_num_pack, + int padded_vector_num_pack, int out_num_cols, int out_num_cols_num_pack, int in_shared_mem_cols, + int in_shared_mem_cols_num_pack, int acc_shared_mem_cols, int acc_shared_mem_cols_num_pack, + int offset, int output_padding, DotFwdParam param) { + asm volatile("s_trap 
0;"); +} + +template +struct KTileDim { + static const int val = 16; +}; + +template<> +struct KTileDim { + static const int val = 8; +}; + +template +struct DotFeatureInteractionKernel { + static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, + int vector_size, int out_num_cols, bool self_interaction, int output_padding, + const DotFwdParam& param) { + const int block_size = 128; + const int block_dim_x = 32; + const int block_dim_y = block_size / block_dim_x; + const int num_blocks = batch_size; + const int mn_tile_dim = 16; + const int k_tile_dim = KTileDim::val; + const int64_t padded_vector_size = GetPaddedDim(vector_size); + const int m_num_tiles = concated_padded_dim / mn_tile_dim; + const int k_num_tiles = padded_vector_size / k_tile_dim; + const int skew_in = 8; + const int skew_acc = 8; + const int in_shared_mem_num_cols = padded_vector_size + skew_in; + const int acc_shared_mem_num_cols = concated_padded_dim + skew_acc; + const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); + using ComputeType = typename DefaultComputeType::type; + const size_t acc_shared_mem_bytes = + concated_padded_dim * acc_shared_mem_num_cols * sizeof(ComputeType); + const size_t total_shared_mem_bytes = in_shared_mem_bytes + acc_shared_mem_bytes; + const int32_t offset = self_interaction ? 1 : 0; + const int out_num_cols_num_pack = out_num_cols / pack_size; + const int vector_num_pack = vector_size / pack_size; + const int padded_vector_num_pack = padded_vector_size / pack_size; + const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; + const int acc_shared_mem_cols_num_pack = acc_shared_mem_num_cols / pack_size; + int max_active_blocks; + OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + DotFeatureInteractionWmmaImpl, + block_size, total_shared_mem_bytes)); + if (max_active_blocks <= 0) { return false; } + hipStream_t cuda_stream = stream->As()->cuda_stream(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(DotFeatureInteractionWmmaImpl), num_blocks, dim3(block_dim_x, block_dim_y), total_shared_mem_bytes, cuda_stream, + m_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, + padded_vector_num_pack, out_num_cols, out_num_cols_num_pack, in_shared_mem_num_cols, + in_shared_mem_cols_num_pack, acc_shared_mem_num_cols, acc_shared_mem_cols_num_pack, + offset, output_padding, param); + return true; + } +}; + +template +struct DotBwdParam { + const T* out_grad; + const T* in[max_in]; + T* in_grad[max_in]; + T* output_concat_grad; + int32_t output_concat_size; + int32_t in_feature_dim[max_in]; + int32_t dim_start_offset[max_in]; + int32_t features_dim; + int32_t num_in; +}; + +template +__global__ void DotFeatureInteractionBackwardWmmaImpl( + int m_num_tiles, int n_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, + int vector_num_pack, int padded_vector_num_pack, int out_num_cols, int in_shared_mem_cols, + int in_shared_mem_cols_num_pack, int matrix_out_grad_shared_mem_cols, int offset, + DotBwdParam param) { + asm volatile("s_trap 0;"); +} + +template +struct DotFeatureInteractionBackwardKernel { + static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, + int vector_size, int out_num_cols, bool self_interaction, + const DotBwdParam& param) { + const int block_size = 256; + const int block_dim_x = 32; + const int block_dim_y = block_size / block_dim_x; + const int num_blocks = batch_size; + const int mn_tile_dim = 16; + const int 
k_tile_dim = KTileDim::val; + const int64_t padded_vector_size = GetPaddedDim(vector_size); + const int m_num_tiles = concated_padded_dim / mn_tile_dim; + const int k_num_tiles = concated_padded_dim / k_tile_dim; + const int n_num_tiles = padded_vector_size / mn_tile_dim; + const int skew_in = 8; + const int in_shared_mem_num_cols = padded_vector_size + skew_in; + const int matrix_out_grad_shared_mem_cols = concated_padded_dim + skew_in; + const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); + const size_t matrix_out_grad_shared_mem_bytes = + concated_padded_dim * matrix_out_grad_shared_mem_cols * sizeof(T); + using ComputeType = typename DefaultComputeType::type; + const size_t in_grad_shared_mem_bytes = + concated_padded_dim * in_shared_mem_num_cols * sizeof(ComputeType); + const size_t total_shared_mem_bytes = + in_shared_mem_bytes + matrix_out_grad_shared_mem_bytes + in_grad_shared_mem_bytes; + const int32_t offset = self_interaction ? 1 : 0; + const int vector_num_pack = vector_size / pack_size; + const int padded_vector_num_pack = padded_vector_size / pack_size; + const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; + int max_active_blocks; + OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + DotFeatureInteractionBackwardWmmaImpl, + block_size, total_shared_mem_bytes)); + if (max_active_blocks <= 0) { return false; } + hipStream_t cuda_stream = stream->As()->cuda_stream(); + DotFeatureInteractionBackwardWmmaImpl + <<>>( + m_num_tiles, n_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, + padded_vector_num_pack, out_num_cols, in_shared_mem_num_cols, + in_shared_mem_cols_num_pack, matrix_out_grad_shared_mem_cols, offset, param); + + return true; + } +}; + +template +bool DispatchFeatureInteractionDotPackSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t batch_size = out->shape_view().At(0); + const int64_t out_num_cols = out->shape_view().At(1); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); + DotFwdParam param; + param.num_in = input_size; + param.out = out->mut_dptr(); + int64_t features_concated_dim = 0; + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + param.dim_start_offset[i] = features_concated_dim; + features_concated_dim += param.in_feature_dim[i]; + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + param.features_dim = features_concated_dim; + if (ctx->has_input("output_concat", 0)) { + const user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); + param.output_concat = output_concat->dptr(); + param.output_concat_size = output_concat->shape_view().At(1); + } else { + param.output_concat = nullptr; + param.output_concat_size = 0; + } + const bool self_interaction = ctx->Attr("self_interaction"); + const int32_t output_padding = ctx->Attr("output_padding"); + if (vector_size % 4 == 0 && out_num_cols % 4 == 0) { + return DotFeatureInteractionKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + output_padding, param); + } else if (vector_size % 2 == 0 && out_num_cols % 2 == 0) { + return 
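+    // Pack-size dispatch: the branches pick the widest vectorized access that divides both
+    // vector_size and out_num_cols (4, then 2, then scalar); only the pack_size template
+    // argument differs between them.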
DotFeatureInteractionKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + output_padding, param); + } else { + return DotFeatureInteractionKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + output_padding, param); + } +} + +template +bool DispatchFeatureInteractionDotBackwardPackSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t out_num_cols = dy->shape_view().At(1); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); + DotBwdParam param; + param.num_in = input_size; + param.out_grad = dy->dptr(); + int64_t features_concated_dim = 0; + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + param.dim_start_offset[i] = features_concated_dim; + features_concated_dim += param.in_feature_dim[i]; + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + param.features_dim = features_concated_dim; + if (ctx->has_output("output_concat_grad", 0)) { + user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); + param.output_concat_grad = output_concat_grad->mut_dptr(); + param.output_concat_size = output_concat_grad->shape_view().At(1); + } else { + param.output_concat_grad = nullptr; + param.output_concat_size = 0; + } + const bool self_interaction = ctx->Attr("self_interaction"); + if (vector_size % 4 == 0) { + return DotFeatureInteractionBackwardKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + param); + } else if (vector_size % 2 == 0) { + return DotFeatureInteractionBackwardKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + param); + } else { + return DotFeatureInteractionBackwardKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + param); + } +} + +template +struct Param { + const T* in[max_in]; + int32_t in_feature_dim[max_in]; + T* out; + int32_t num_in; +}; + +template +__global__ void FeatureInteractionSum(int64_t batch_size, int64_t vector_num_pack, + Param param) { + using ComputeType = typename DefaultComputeType::type; + Pack* dst_pack = reinterpret_cast*>(param.out); + for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; + batch_idx += gridDim.x * blockDim.y) { + Pack* batch_out = dst_pack + batch_idx * vector_num_pack; + for (int col_id = threadIdx.x; col_id < vector_num_pack; col_id += blockDim.x) { + Pack sum; + Pack square_sum; +#pragma unroll + for (int k = 0; k < pack_size; ++k) { + sum.elem[k] = static_cast(0); + square_sum.elem[k] = static_cast(0); + } + for (int i = 0; i < max_in; ++i) { + if (i >= param.num_in) { break; } + const Pack* batch_in = + reinterpret_cast*>(param.in[i]) + + batch_idx * param.in_feature_dim[i] * vector_num_pack; +#pragma unroll + for (int j = 0; j < param.in_feature_dim[i]; ++j) { + Pack val = batch_in[j * vector_num_pack + col_id]; +#pragma unroll + for (int k = 0; k < pack_size; ++k) { + const ComputeType 
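+          // Accumulate S = sum_j x_j and Q = sum_j x_j^2 per output element; the pooled
+          // interaction written below is 0.5 * (S * S - Q), which equals the sum of all
+          // pairwise products x_j * x_l with j < l, so no explicit D x D matmul is needed.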
compute_val = static_cast(val.elem[k]); + sum.elem[k] += compute_val; + square_sum.elem[k] += compute_val * compute_val; + } + } + } + Pack out; +#pragma unroll + for (int k = 0; k < pack_size; ++k) { + out.elem[k] = static_cast((sum.elem[k] * sum.elem[k] - square_sum.elem[k]) + * static_cast(0.5)); + } + batch_out[col_id] = out; + } + } +} + +template +struct GradParam { + const T* out_grad; + const T* in[max_in]; + int32_t in_feature_dim[max_in]; + T* in_grad[max_in]; + int32_t num_in; +}; + +template +__global__ void FeatureInteractionSumGrad(int64_t batch_size, int64_t vector_size, + GradParam param) { + using ComputeType = typename DefaultComputeType::type; + for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; + batch_idx += gridDim.x * blockDim.y) { + const T* batch_out_grad = param.out_grad + batch_idx * vector_size; + for (int col_id = threadIdx.x; col_id < vector_size; col_id += blockDim.x) { + ComputeType sum = 0; + for (int i = 0; i < max_in; ++i) { + if (i >= param.num_in) { break; } + const T* batch_in = param.in[i] + batch_idx * param.in_feature_dim[i] * vector_size; + for (int j = 0; j < param.in_feature_dim[i]; ++j) { + sum += static_cast(batch_in[j * vector_size + col_id]); + } + } + for (int i = 0; i < max_in; ++i) { + if (i >= param.num_in) { break; } + const int64_t in_batch_offset = batch_idx * param.in_feature_dim[i] * vector_size; + const T* batch_in = param.in[i] + in_batch_offset; + T* batch_in_grad = param.in_grad[i] + in_batch_offset; + for (int j = 0; j < param.in_feature_dim[i]; ++j) { + const int64_t offset = j * vector_size + col_id; + batch_in_grad[offset] = + static_cast(static_cast(batch_out_grad[col_id]) + * (sum - static_cast(batch_in[offset]))); + } + } + } + } +} + +void GetBlockDims(const int64_t vector_size, int* block_dim_x, int* block_dim_y) { + const int block_size = 256; + if (vector_size < block_size) { + *block_dim_x = std::ceil(static_cast(vector_size) / 8) * 8; + *block_dim_y = (block_size + *block_dim_x - 1) / *block_dim_x; + } else { + *block_dim_x = block_size; + *block_dim_y = 1; + } +} + +int GetNumBlocks(const int64_t num_instances, const int64_t instance_per_block) { + int max_blocks = (num_instances + instance_per_block - 1) / instance_per_block; + return std::min(max_blocks, kCudaMaxBlocksNum); +} + +template +void DispatchFeatureInteractionSumPackSize(ep::Stream* stream, const int64_t batch_size, + const int64_t vector_size, + const Param& param) { + int block_dim_x; + int block_dim_y; + const int pack_size = (vector_size % 2 == 0) ? 
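+  // Sum pooling only uses 2-element Pack accesses when vector_size is even; GetBlockDims then
+  // rounds the x dimension up to a multiple of 8 and fills the rest of the 256-thread block with
+  // batch rows, e.g. vector_num_pack == 16 gives a 16 x 16 block that covers 16 batch rows at a
+  // time.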
2 : 1; + const int64_t vector_num_pack = vector_size / pack_size; + GetBlockDims(vector_num_pack, &block_dim_x, &block_dim_y); + const int num_blocks = GetNumBlocks(batch_size, block_dim_y); + dim3 block_dims = dim3(block_dim_x, block_dim_y); + hipStream_t cuda_stream = stream->As()->cuda_stream(); + if (pack_size == 2) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); + } else { + hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); + } +} + +template +void DispatchFeatureInteractionSumInputSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t batch_size = out->shape_view().At(0); + const int64_t vector_size = out->shape_view().At(1); + Param param; + param.num_in = input_size; + param.out = out->mut_dptr(); + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + } + DispatchFeatureInteractionSumPackSize(ctx->stream(), batch_size, vector_size, param); +} + +template +void DispatchFeatureInteractionSumGradInputSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t vector_size = dy->shape_view().At(1); + int block_dim_x; + int block_dim_y; + GetBlockDims(vector_size, &block_dim_x, &block_dim_y); + const int num_blocks = GetNumBlocks(batch_size, block_dim_y); + dim3 block_dims = dim3(block_dim_x, block_dim_y); + GradParam param; + param.num_in = input_size; + param.out_grad = dy->dptr(); + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); + } + FeatureInteractionSumGrad + <<stream()->As()->cuda_stream()>>>( + batch_size, vector_size, param); +} + +} // namespace + +template +class FusedDotFeatureInteractionPoolingSumKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionPoolingSumKernel() = default; + ~FusedDotFeatureInteractionPoolingSumKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } else if (input_size == 2) { + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } else if (input_size <= 8) { + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "sum")); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(half) + +template +bool TryLaunchTensorCoreDotKernel(user_op::KernelComputeContext* ctx) { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } else if (input_size == 2) { + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } else if (input_size <= 8) { + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } +} + +template +bool TryLaunchTensorCoreDotBackwardKernel(user_op::KernelComputeContext* ctx) { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } else if (input_size == 2) { + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } else if (input_size <= 8) { + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } +} +template +class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionKernel() = default; + ~FusedDotFeatureInteractionKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const DataType data_type = out->data_type(); + CHECK_LT(out->shape_view().elem_cnt(), GetMaxVal()); + auto* cuda_stream = ctx->stream()->As(); + // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) + // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { + // bool success = TryLaunchTensorCoreDotKernel(ctx); + // if (success == true) { return; } + // } + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t batch_size = out->shape_view().At(0); + int64_t features_concated_dim = 0; + for (int64_t i = 0; i < ctx->input_size("features"); ++i) { + features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); + const int64_t out_dim = out->shape_view().At(1); + const int32_t output_padding = ctx->Attr("output_padding"); + const int64_t valid_out_dim = out_dim - output_padding; + const bool self_interaction = ctx->Attr("self_interaction"); + + T* matmul_out = reinterpret_cast(tmp_buffer->mut_dptr()); + size_t matmul_out_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * 
sizeof(T)); + const int64_t interaction_dim = self_interaction + ? features_concated_dim * (features_concated_dim + 1) / 2 + : features_concated_dim * (features_concated_dim - 1) / 2; + int32_t* gather_indices_ptr = + reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size); + size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); + T* padded_concated_features_ptr = + reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size + gather_indices_size); + size_t padded_concated_features_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + matmul_out_size + gather_indices_size + padded_concated_features_size); + ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, + padded_concated_features_ptr); + auto batch_matmul = ep::primitive::NewPrimitive( + ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, + ep::primitive::BlasTransposeType::T); + batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, concated_padded_dim, + vector_size, 1.0, padded_concated_features_ptr, + padded_concated_features_ptr, 0.0, matmul_out); + + int64_t output_concat_end_dim = 0; + const T* output_concat_ptr = nullptr; + if (ctx->has_input("output_concat", 0)) { + user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); + output_concat_end_dim = output_concat->shape_view().At(1); + output_concat_ptr = output_concat->dptr(); + } + CHECK_EQ(valid_out_dim, output_concat_end_dim + interaction_dim); + GatherConcatKernel(ctx->stream(), out->shape_view().elem_cnt(), out_dim, valid_out_dim, + features_concated_dim, concated_padded_dim, output_concat_end_dim, + self_interaction, matmul_out, output_concat_ptr, gather_indices_ptr, + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +user_op::InferTmpSizeFn GenFusedDotFeatureInteractionInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + const Shape& first_feature_shape = ctx->InputShape("features", 0); + const int64_t batch_size = first_feature_shape.At(0); + const int64_t vector_size = first_feature_shape.At(2); + int64_t features_concated_dim = 0; + for (int32_t i = 0; i < ctx->input_size("features"); ++i) { + features_concated_dim += ctx->InputShape("features", i).At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + size_t matmul_out_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); + const bool self_interaction = ctx->Attr("self_interaction"); + const int64_t interaction_dim = self_interaction + ? 
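+    // interaction_dim counts the gathered lower-triangle entries: d * (d + 1) / 2 with the
+    // diagonal (self interaction) or d * (d - 1) / 2 without it, where d is the un-padded
+    // features_concated_dim; e.g. d == 26 without self interaction yields 26 * 25 / 2 == 325.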
features_concated_dim * (features_concated_dim + 1) / 2 + : features_concated_dim * (features_concated_dim - 1) / 2; + size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); + size_t padded_concated_features_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + return matmul_out_size + gather_indices_size + padded_concated_features_size; + }; +} + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "none")) \ + .SetInferTmpSizeFn(GenFusedDotFeatureInteractionInferTmpSizeFn()); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(half) + +template +class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionGradKernel() = default; + ~FusedDotFeatureInteractionGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const DataType data_type = dy->data_type(); + auto* cuda_stream = ctx->stream()->As(); + // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) + // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { + // bool success = TryLaunchTensorCoreDotBackwardKernel(ctx); + // if (success == true) { return; } + // } + const int64_t batch_size = dy->shape_view().At(0); + int64_t features_concated_dim = 0; + for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { + features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features_grad", 0)->shape().At(2); + const int64_t out_dim = dy->shape_view().At(1); + const bool self_interaction = ctx->Attr("self_interaction"); + T* matmul_out_grad_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); + size_t matmul_out_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); + T* padded_concated_features_grad_ptr = + reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_grad_size); + size_t padded_concated_features_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + T* padded_concated_features_ptr = reinterpret_cast( + tmp_buffer->mut_dptr() + matmul_out_grad_size + padded_concated_features_grad_size); + size_t padded_concated_features_size = padded_concated_features_grad_size; + CHECK_LE( + matmul_out_grad_size + padded_concated_features_grad_size + padded_concated_features_size, + tmp_buffer->shape_view().elem_cnt()); + ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, + padded_concated_features_ptr); + + T* output_concat_grad_ptr = nullptr; + int64_t output_concat_end_dim = 0; + if (ctx->has_output("output_concat_grad", 0)) { + user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); + output_concat_grad_ptr = output_concat_grad->mut_dptr(); + output_concat_end_dim = 
output_concat_grad->shape_view().At(1); + } + ScatterSplitAddTranspose(ctx->stream(), batch_size, out_dim, concated_padded_dim, + features_concated_dim, output_concat_end_dim, self_interaction, + dy->dptr(), output_concat_grad_ptr, matmul_out_grad_ptr); + + auto batch_matmul = ep::primitive::NewPrimitive( + ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, + ep::primitive::BlasTransposeType::N); + batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, vector_size, + concated_padded_dim, 1.0, matmul_out_grad_ptr, + padded_concated_features_ptr, 0.0, padded_concated_features_grad_ptr); + + ConcatFeaturesGrad(ctx, batch_size, concated_padded_dim, vector_size, + padded_concated_features_grad_ptr); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +user_op::InferTmpSizeFn GenFusedDotFeatureInteractionGradInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + int64_t features_concated_dim = 0; + for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { + features_concated_dim += ctx->InputShape("features_grad", i).At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + const int64_t batch_size = ctx->InputShape("features_grad", 0).At(0); + const int64_t vector_size = ctx->InputShape("features_grad", 0).At(2); + size_t matmul_out_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); + size_t padded_concated_features_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + size_t padded_concated_features_size = padded_concated_features_grad_size; + return matmul_out_grad_size + padded_concated_features_grad_size + + padded_concated_features_size; + }; +} + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "none")) \ + .SetInferTmpSizeFn(GenFusedDotFeatureInteractionGradInferTmpSizeFn()); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(half) + +template +class FusedDotFeatureInteractionPoolingSumGradKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionPoolingSumGradKernel() = default; + ~FusedDotFeatureInteractionPoolingSumGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } else if (input_size == 2) { + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } else if (input_size <= 8) { + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "sum")); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(half) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp b/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp index d910b5d..3371b04 100644 --- a/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp @@ -1,472 +1,472 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/rocm/cuda_device.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" - -// NOTE(Liang Depeng): The implementation of fused_gru_cell is modified from -// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu - -namespace oneflow { - -namespace { - -template -struct AccumulateType {}; -template<> -struct AccumulateType { - using type = float; -}; -template<> -struct AccumulateType { - using type = double; -}; - -template -using acc_type = typename AccumulateType::type; - -#define H2F(input) static_cast(input) -#define F2H(input) static_cast(input) - -template -__device__ __forceinline__ T sigmoid(T in) { - T one = static_cast(1.0); - return one / (one + ::exp(-in)); -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ void gru_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, - const T* input_gates_ptr, const T* hidden_gates_ptr, - const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, - T* hy_ptr, T* workspace_ptr) { - bool has_bias = input_bias_ptr != nullptr; - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; - - T ir = input_gates_ptr[offset + 0 * hidden_size]; - T ii = input_gates_ptr[offset + 1 * hidden_size]; - T in = input_gates_ptr[offset + 2 * hidden_size]; - T 
hr = hidden_gates_ptr[offset + 0 * hidden_size]; - T hi = hidden_gates_ptr[offset + 1 * hidden_size]; - T hn = hidden_gates_ptr[offset + 2 * hidden_size]; - - T hx = hx_ptr[linearIndex]; - T* hy = &(hy_ptr[linearIndex]); - - T b1r, b1i, b1n, b2r, b2i, b2n; - - if (has_bias) { - b1r = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b1i = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b1n = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - - b2r = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b2i = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b2n = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - } else { - b1r = F2H(0.0); - b1i = F2H(0.0); - b1n = F2H(0.0); - b2r = F2H(0.0); - b2i = F2H(0.0); - b2n = F2H(0.0); - } - - offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; - ACC_T rg, ig, ng; - rg = sigmoid(H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r)); - ig = sigmoid(H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i)); - - ng = H2F(in) + H2F(b1n) + rg * (H2F(hn) + H2F(b2n)); - ng = ::tanh(ng); - *hy = F2H(ng + ig * (H2F(hx) - ng)); - - // SAVE FOR BACKWARDS - workspace_ptr[offset + 0 * hidden_size] = F2H(rg); - workspace_ptr[offset + 1 * hidden_size] = F2H(ig); - workspace_ptr[offset + 2 * hidden_size] = F2H(ng); - workspace_ptr[offset + 3 * hidden_size] = hx; - workspace_ptr[offset + 4 * hidden_size] = F2H(H2F(hn) + H2F(b2n)); - } -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ - void gru_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, - const T* workspace_ptr, T* grad_input_gates_ptr, - T* grad_hidden_gates_ptr, T* grad_hx_ptr) { - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; - - T rg = workspace_ptr[offset + 0 * hidden_size]; - T ig = workspace_ptr[offset + 1 * hidden_size]; - T ng = workspace_ptr[offset + 2 * hidden_size]; - T hx = workspace_ptr[offset + 3 * hidden_size]; - T hn = workspace_ptr[offset + 4 * hidden_size]; - - T go = grad_hy_ptr[linearIndex]; - - offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; - - ACC_T gig = H2F(go) * (H2F(hx) - H2F(ng)) * (1 - H2F(ig)) * H2F(ig); - ACC_T ghx = H2F(go) * H2F(ig); - ACC_T gin = H2F(go) * (1 - H2F(ig)) * (1 - H2F(ng) * H2F(ng)); - ACC_T ghn = gin * H2F(rg); - ACC_T grg = gin * H2F(hn) * (1 - H2F(rg)) * H2F(rg); - - grad_input_gates_ptr[offset + 0 * hidden_size] = F2H(grg); - grad_input_gates_ptr[offset + 1 * hidden_size] = F2H(gig); - grad_input_gates_ptr[offset + 2 * hidden_size] = F2H(gin); - - grad_hidden_gates_ptr[offset + 0 * hidden_size] = F2H(grg); - grad_hidden_gates_ptr[offset + 1 * hidden_size] = F2H(gig); - grad_hidden_gates_ptr[offset + 2 * hidden_size] = F2H(ghn); - if (grad_hx_ptr != nullptr) { grad_hx_ptr[linearIndex] = F2H(ghx); } - } -} - -template -struct FusedGruCellGradFunctor final { - void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* grad_hy_ptr, const T* workspace_ptr, - T* grad_input_gates_ptr, T* grad_hidden_gates_ptr, T* grad_hx_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, - static_cast(hx_numel), static_cast(hidden_size), - grad_hy_ptr, 
workspace_ptr, grad_input_gates_ptr, grad_hidden_gates_ptr, - grad_hx_ptr); - } else { - RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, hx_numel, - hidden_size, grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, - grad_hidden_gates_ptr, grad_hx_ptr); - } - } -}; - -template<> -void FusedGruCellGradFunctor::operator()( - ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* grad_hy_ptr, const float16* workspace_ptr, - float16* grad_input_gates_ptr, float16* grad_hidden_gates_ptr, float16* grad_hx_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL( - (gru_cell_backward), stream, hx_numel, static_cast(hx_numel), - static_cast(hidden_size), reinterpret_cast(grad_hy_ptr), - reinterpret_cast(workspace_ptr), reinterpret_cast(grad_input_gates_ptr), - reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); - } else { - RUN_CUDA_KERNEL( - (gru_cell_backward), stream, hx_numel, hx_numel, hidden_size, - reinterpret_cast(grad_hy_ptr), reinterpret_cast(workspace_ptr), - reinterpret_cast(grad_input_gates_ptr), - reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); - } -} - -template -struct FusedGruCellFunctor final { - void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, - const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, - T* workspace_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, - static_cast(hx_numel), static_cast(hidden_size), - input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, - hy_ptr, workspace_ptr); - } else { - RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, - hidden_size, input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, - hidden_bias_ptr, hy_ptr, workspace_ptr); - } - } -}; - -template<> -void FusedGruCellFunctor::operator()( - ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, - const float16* hx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, - float16* hy_ptr, float16* workspace_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL( - (gru_cell_forward), stream, hx_numel, static_cast(hx_numel), - static_cast(hidden_size), reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), reinterpret_cast(hx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(workspace_ptr)); - } else { - RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, - hidden_size, reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), - reinterpret_cast(hx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(workspace_ptr)); - } -} - -} // namespace - -template -class GpuFusedGruCellKernel final : public user_op::OpKernel { - public: - GpuFusedGruCellKernel() = default; - ~GpuFusedGruCellKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); - const user_op::Tensor* hidden_gates = 
ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); - const user_op::Tensor* hx = ctx->Tensor4ArgNameAndIndex("hx", 0); - user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); - user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - - const T* input_bias_ptr = nullptr; - const T* hidden_bias_ptr = nullptr; - if (ctx->has_input("input_bias", 0)) { - CHECK(ctx->has_input("hidden_bias", 0)); - input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); - hidden_bias_ptr = ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); - } - const T* input_gates_ptr = input_gates->dptr(); - const T* hidden_gates_ptr = hidden_gates->dptr(); - const T* hx_ptr = hx->dptr(); - - T* hy_ptr = hy->mut_dptr(); - T* workspace_ptr = workspace->mut_dptr(); - const int64_t hx_numel = hx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = hx->shape_view().At(hx->shape_view().NumAxes() - 1); - FusedGruCellFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, input_gates_ptr, - hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, hy_ptr, - workspace_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_GRU_CELL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_gru_cell") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("hx", 0) == GetDataType::value) \ - && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ - && (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) - -REGISTER_FUSED_GRU_CELL_KERNEL(float); -REGISTER_FUSED_GRU_CELL_KERNEL(float16); - -class GpuFusedGruCellGradFloatKernel final : public user_op::OpKernel { - public: - GpuFusedGruCellGradFloatKernel() = default; - ~GpuFusedGruCellGradFloatKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); - user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); - - const float* grad_hy_ptr = grad_hy->dptr(); - const float* workspace_ptr = workspace->dptr(); - - float* grad_input_gates_ptr = grad_input_gates->mut_dptr(); - float* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); - - float* grad_hx_ptr = nullptr; - if (ctx->has_output("grad_hx", 0)) { - user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); - grad_hx_ptr = grad_hx->mut_dptr(); - } - - const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); - FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, - grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, - grad_hidden_gates_ptr, grad_hx_ptr); - - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - float* grad_input_bias_ptr = - ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0)->mut_dptr(); - std::vector axis; - axis.push_back(0); - const Shape& reduced_shape = - CreateReducedShape(grad_input_gates->shape_view(), {axis.begin(), axis.end()}); - user_op::Tensor* tmp_buffer = 
ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, grad_input_bias_ptr), - XpuVarNdarray(grad_input_gates->shape_view(), - grad_input_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); - - float* grad_hidden_bias_ptr = - ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0)->mut_dptr(); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, grad_hidden_bias_ptr), - XpuVarNdarray(grad_hidden_gates->shape_view(), - grad_hidden_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_gru_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); - tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - -class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { - public: - GpuFusedGruCellGradHalfKernel() = default; - ~GpuFusedGruCellGradHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); - user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); - - const float16* grad_hy_ptr = grad_hy->dptr(); - const float16* workspace_ptr = workspace->dptr(); - - float16* grad_input_gates_ptr = grad_input_gates->mut_dptr(); - float16* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); - - float16* grad_hx_ptr = nullptr; - if (ctx->has_output("grad_hx", 0)) { - user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); - grad_hx_ptr = grad_hx->mut_dptr(); - } - - const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); - FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, - grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, - grad_hidden_gates_ptr, grad_hx_ptr); - - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - std::vector axis; - axis.push_back(0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = grad_input_gates->shape_view(); - const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); - float* in_tmp_buffer = tmp_buffer->mut_dptr(); - const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - float* out_tmp_buffer = - reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); - const size_t out_tmp_buffer_bytes = - GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); - float* reduce_tmp_buffer = reinterpret_cast( - 
tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); - const size_t reduce_tmp_buffer_bytes = - GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape_view().elem_cnt()); - auto h2f = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat16, DataType::kFloat); - CHECK(h2f); - auto f2h = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat, DataType::kFloat16); - CHECK(f2h); - h2f->Launch(ctx->stream(), grad_input_gates->dptr(), in_tmp_buffer, - in_shape.elem_cnt()); - - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), - XpuVarNdarray(in_shape, in_tmp_buffer), - XpuVarNdarray(in_shape, reduce_tmp_buffer)); - - user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0); - f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape_view().elem_cnt()); - - h2f->Launch(ctx->stream(), grad_hidden_gates->dptr(), in_tmp_buffer, - in_shape.elem_cnt()); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), - XpuVarNdarray(in_shape, in_tmp_buffer), - XpuVarNdarray(in_shape, reduce_tmp_buffer)); - - output_tensor = ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0); - f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape_view().elem_cnt()); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_gru_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); - const Shape& out_shape = ctx->OutputTensorDesc("grad_input_bias", 0)->shape(); - tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)) - + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/ep/rocm/cuda_device.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" + +// NOTE(Liang Depeng): The implementation of fused_gru_cell is modified from +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu + +namespace oneflow { + +namespace { + +template +struct AccumulateType {}; +template<> +struct AccumulateType { + using type = float; +}; +template<> +struct AccumulateType { + using type = double; +}; + +template +using acc_type = typename AccumulateType::type; + +#define H2F(input) static_cast(input) +#define F2H(input) static_cast(input) + +template +__device__ __forceinline__ T sigmoid(T in) { + T one = static_cast(1.0); + return one / (one + ::exp(-in)); +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ void gru_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, + const T* input_gates_ptr, const T* hidden_gates_ptr, + const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, + T* hy_ptr, T* workspace_ptr) { + bool has_bias = input_bias_ptr != nullptr; + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; + + T ir = input_gates_ptr[offset + 0 * hidden_size]; + T ii = input_gates_ptr[offset + 1 * hidden_size]; + T in = input_gates_ptr[offset + 2 * hidden_size]; + T hr = hidden_gates_ptr[offset + 0 * hidden_size]; + T hi = hidden_gates_ptr[offset + 1 * hidden_size]; + T hn = hidden_gates_ptr[offset + 2 * hidden_size]; + + T hx = hx_ptr[linearIndex]; + T* hy = &(hy_ptr[linearIndex]); + + T b1r, b1i, b1n, b2r, b2i, b2n; + + if (has_bias) { + b1r = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b1i = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b1n = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + + b2r = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b2i = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b2n = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + } else { + b1r = F2H(0.0); + b1i = F2H(0.0); + b1n = F2H(0.0); + b2r = F2H(0.0); + b2i = F2H(0.0); + b2n = F2H(0.0); + } + + offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; + ACC_T rg, ig, ng; + rg = sigmoid(H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r)); + ig = sigmoid(H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i)); + + ng = H2F(in) + H2F(b1n) + rg * (H2F(hn) + H2F(b2n)); + ng = ::tanh(ng); + *hy = F2H(ng + ig * (H2F(hx) - ng)); + + // SAVE FOR BACKWARDS + workspace_ptr[offset + 0 * hidden_size] = F2H(rg); + workspace_ptr[offset + 1 * hidden_size] = F2H(ig); + workspace_ptr[offset + 2 * hidden_size] = F2H(ng); + workspace_ptr[offset + 3 * hidden_size] = hx; + workspace_ptr[offset + 4 * hidden_size] = F2H(H2F(hn) + H2F(b2n)); + } +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ + void 
gru_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, + const T* workspace_ptr, T* grad_input_gates_ptr, + T* grad_hidden_gates_ptr, T* grad_hx_ptr) { + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; + + T rg = workspace_ptr[offset + 0 * hidden_size]; + T ig = workspace_ptr[offset + 1 * hidden_size]; + T ng = workspace_ptr[offset + 2 * hidden_size]; + T hx = workspace_ptr[offset + 3 * hidden_size]; + T hn = workspace_ptr[offset + 4 * hidden_size]; + + T go = grad_hy_ptr[linearIndex]; + + offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; + + ACC_T gig = H2F(go) * (H2F(hx) - H2F(ng)) * (1 - H2F(ig)) * H2F(ig); + ACC_T ghx = H2F(go) * H2F(ig); + ACC_T gin = H2F(go) * (1 - H2F(ig)) * (1 - H2F(ng) * H2F(ng)); + ACC_T ghn = gin * H2F(rg); + ACC_T grg = gin * H2F(hn) * (1 - H2F(rg)) * H2F(rg); + + grad_input_gates_ptr[offset + 0 * hidden_size] = F2H(grg); + grad_input_gates_ptr[offset + 1 * hidden_size] = F2H(gig); + grad_input_gates_ptr[offset + 2 * hidden_size] = F2H(gin); + + grad_hidden_gates_ptr[offset + 0 * hidden_size] = F2H(grg); + grad_hidden_gates_ptr[offset + 1 * hidden_size] = F2H(gig); + grad_hidden_gates_ptr[offset + 2 * hidden_size] = F2H(ghn); + if (grad_hx_ptr != nullptr) { grad_hx_ptr[linearIndex] = F2H(ghx); } + } +} + +template +struct FusedGruCellGradFunctor final { + void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* grad_hy_ptr, const T* workspace_ptr, + T* grad_input_gates_ptr, T* grad_hidden_gates_ptr, T* grad_hx_ptr) { + using ACC_T = acc_type; + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, + static_cast(hx_numel), static_cast(hidden_size), + grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, grad_hidden_gates_ptr, + grad_hx_ptr); + } else { + RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, hx_numel, + hidden_size, grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, + grad_hidden_gates_ptr, grad_hx_ptr); + } + } +}; + +template<> +void FusedGruCellGradFunctor::operator()( + ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* grad_hy_ptr, const float16* workspace_ptr, + float16* grad_input_gates_ptr, float16* grad_hidden_gates_ptr, float16* grad_hx_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL( + (gru_cell_backward), stream, hx_numel, static_cast(hx_numel), + static_cast(hidden_size), reinterpret_cast(grad_hy_ptr), + reinterpret_cast(workspace_ptr), reinterpret_cast(grad_input_gates_ptr), + reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); + } else { + RUN_CUDA_KERNEL( + (gru_cell_backward), stream, hx_numel, hx_numel, hidden_size, + reinterpret_cast(grad_hy_ptr), reinterpret_cast(workspace_ptr), + reinterpret_cast(grad_input_gates_ptr), + reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); + } +} + +template +struct FusedGruCellFunctor final { + void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, + const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, + T* workspace_ptr) { + using ACC_T = acc_type; 
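+    // Index-width dispatch: the branch below launches the 32-bit-index instantiation of
+    // gru_cell_forward when the workspace element count fits within the 32-bit limit
+    // (cheaper index arithmetic on the device) and otherwise launches the 64-bit variant.
+    // Both paths run the same kernel on the caller's stream through RUN_CUDA_KERNEL.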
+ if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, + static_cast(hx_numel), static_cast(hidden_size), + input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, + hy_ptr, workspace_ptr); + } else { + RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, + hidden_size, input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, + hidden_bias_ptr, hy_ptr, workspace_ptr); + } + } +}; + +template<> +void FusedGruCellFunctor::operator()( + ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, + const float16* hx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, + float16* hy_ptr, float16* workspace_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL( + (gru_cell_forward), stream, hx_numel, static_cast(hx_numel), + static_cast(hidden_size), reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), reinterpret_cast(hx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(workspace_ptr)); + } else { + RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, + hidden_size, reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), + reinterpret_cast(hx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(workspace_ptr)); + } +} + +} // namespace + +template +class GpuFusedGruCellKernel final : public user_op::OpKernel { + public: + GpuFusedGruCellKernel() = default; + ~GpuFusedGruCellKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); + const user_op::Tensor* hidden_gates = ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); + const user_op::Tensor* hx = ctx->Tensor4ArgNameAndIndex("hx", 0); + user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); + user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + + const T* input_bias_ptr = nullptr; + const T* hidden_bias_ptr = nullptr; + if (ctx->has_input("input_bias", 0)) { + CHECK(ctx->has_input("hidden_bias", 0)); + input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); + hidden_bias_ptr = ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); + } + const T* input_gates_ptr = input_gates->dptr(); + const T* hidden_gates_ptr = hidden_gates->dptr(); + const T* hx_ptr = hx->dptr(); + + T* hy_ptr = hy->mut_dptr(); + T* workspace_ptr = workspace->mut_dptr(); + const int64_t hx_numel = hx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = hx->shape_view().At(hx->shape_view().NumAxes() - 1); + FusedGruCellFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, input_gates_ptr, + hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, hy_ptr, + workspace_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_GRU_CELL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_gru_cell") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("hx", 0) == GetDataType::value) \ + && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ + 
&& (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) + +REGISTER_FUSED_GRU_CELL_KERNEL(float); +REGISTER_FUSED_GRU_CELL_KERNEL(float16); + +class GpuFusedGruCellGradFloatKernel final : public user_op::OpKernel { + public: + GpuFusedGruCellGradFloatKernel() = default; + ~GpuFusedGruCellGradFloatKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); + user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); + + const float* grad_hy_ptr = grad_hy->dptr(); + const float* workspace_ptr = workspace->dptr(); + + float* grad_input_gates_ptr = grad_input_gates->mut_dptr(); + float* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); + + float* grad_hx_ptr = nullptr; + if (ctx->has_output("grad_hx", 0)) { + user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); + grad_hx_ptr = grad_hx->mut_dptr(); + } + + const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); + FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, + grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, + grad_hidden_gates_ptr, grad_hx_ptr); + + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + float* grad_input_bias_ptr = + ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0)->mut_dptr(); + std::vector axis; + axis.push_back(0); + const Shape& reduced_shape = + CreateReducedShape(grad_input_gates->shape_view(), {axis.begin(), axis.end()}); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, grad_input_bias_ptr), + XpuVarNdarray(grad_input_gates->shape_view(), + grad_input_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); + + float* grad_hidden_bias_ptr = + ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0)->mut_dptr(); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, grad_hidden_bias_ptr), + XpuVarNdarray(grad_hidden_gates->shape_view(), + grad_hidden_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("fused_gru_cell_grad") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); + tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + +class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { + public: + GpuFusedGruCellGradHalfKernel() = default; + ~GpuFusedGruCellGradHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void 
Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); + user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); + + const float16* grad_hy_ptr = grad_hy->dptr(); + const float16* workspace_ptr = workspace->dptr(); + + float16* grad_input_gates_ptr = grad_input_gates->mut_dptr(); + float16* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); + + float16* grad_hx_ptr = nullptr; + if (ctx->has_output("grad_hx", 0)) { + user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); + grad_hx_ptr = grad_hx->mut_dptr(); + } + + const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); + FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, + grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, + grad_hidden_gates_ptr, grad_hx_ptr); + + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + std::vector axis; + axis.push_back(0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const ShapeView& in_shape = grad_input_gates->shape_view(); + const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); + float* in_tmp_buffer = tmp_buffer->mut_dptr(); + const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + float* out_tmp_buffer = + reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); + const size_t out_tmp_buffer_bytes = + GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); + float* reduce_tmp_buffer = reinterpret_cast( + tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); + const size_t reduce_tmp_buffer_bytes = + GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, + tmp_buffer->shape_view().elem_cnt()); + auto h2f = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat16, DataType::kFloat); + CHECK(h2f); + auto f2h = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat, DataType::kFloat16); + CHECK(f2h); + h2f->Launch(ctx->stream(), grad_input_gates->dptr(), in_tmp_buffer, + in_shape.elem_cnt()); + + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), + XpuVarNdarray(in_shape, in_tmp_buffer), + XpuVarNdarray(in_shape, reduce_tmp_buffer)); + + user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0); + f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), + output_tensor->shape_view().elem_cnt()); + + h2f->Launch(ctx->stream(), grad_hidden_gates->dptr(), in_tmp_buffer, + in_shape.elem_cnt()); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), + XpuVarNdarray(in_shape, in_tmp_buffer), + XpuVarNdarray(in_shape, reduce_tmp_buffer)); + + output_tensor = ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0); + f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), + output_tensor->shape_view().elem_cnt()); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + 
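+// Registration for the half-precision grad kernel. Its InferTmpSizeFn below mirrors the
+// carve-up done in Compute above: two float staging buffers sized for the 3 * hidden_size
+// gate grads (one holding the cast input, one as reduction scratch) plus one float buffer
+// for the reduced bias grad, each rounded up with GetCudaAlignedSize. The gate grads are
+// cast half -> float, reduced over axis 0, then cast back to half into grad_input_bias and
+// grad_hidden_bias.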
+REGISTER_USER_KERNEL("fused_gru_cell_grad") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); + const Shape& out_shape = ctx->OutputTensorDesc("grad_input_bias", 0)->shape(); + tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)) + + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp b/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp index 60cbc6c..d06005c 100644 --- a/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp @@ -1,505 +1,505 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/rocm/cuda_device.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" - -// NOTE(Liang Depeng): The implementation of fused_lstm_cell is modified from -// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu - -namespace oneflow { - -namespace { - -template -struct AccumulateType {}; -template<> -struct AccumulateType { - using type = float; -}; -template<> -struct AccumulateType { - using type = double; -}; - -template -using acc_type = typename AccumulateType::type; - -#define H2F(input) static_cast(input) -#define F2H(input) static_cast(input) - -template -__device__ __forceinline__ T sigmoid(T in) { - T one = static_cast(1.0); - return one / (one + ::exp(-in)); -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ - void lstm_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, - const T* input_gates_ptr, const T* hidden_gates_ptr, const T* cx_ptr, - const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, T* cy_ptr, - T* workspace_ptr) { - bool has_bias = input_bias_ptr != nullptr; - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; - 
- T iig = input_gates_ptr[offset + 0 * hidden_size]; - T ifg = input_gates_ptr[offset + 1 * hidden_size]; - T icg = input_gates_ptr[offset + 2 * hidden_size]; - T iog = input_gates_ptr[offset + 3 * hidden_size]; - - T hig = hidden_gates_ptr[offset + 0 * hidden_size]; - T hfg = hidden_gates_ptr[offset + 1 * hidden_size]; - T hcg = hidden_gates_ptr[offset + 2 * hidden_size]; - T hog = hidden_gates_ptr[offset + 3 * hidden_size]; - - T* wig = &(workspace_ptr[offset + 0 * hidden_size]); - T* wfg = &(workspace_ptr[offset + 1 * hidden_size]); - T* wcg = &(workspace_ptr[offset + 2 * hidden_size]); - T* wog = &(workspace_ptr[offset + 3 * hidden_size]); - - T cx = cx_ptr[linearIndex]; - - T* hy = &(hy_ptr[linearIndex]); - T* cy = &(cy_ptr[linearIndex]); - - T b1i, b1f, b1c, b1o; - T b2i, b2f, b2c, b2o; - - if (has_bias) { - b1i = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b1f = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b1c = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - b1o = input_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; - - b2i = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b2f = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b2c = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - b2o = hidden_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; - } else { - b1i = F2H(0.0); - b1f = F2H(0.0); - b1c = F2H(0.0); - b1o = F2H(0.0); - b2i = F2H(0.0); - b2f = F2H(0.0); - b2c = F2H(0.0); - b2o = F2H(0.0); - } - - ACC_T ig, fg, cg, og; - ACC_T f_hy, f_cy; - - ig = sigmoid(H2F(iig) + H2F(hig) + H2F(b1i) + H2F(b2i)); - fg = sigmoid(H2F(ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f)); - cg = ::tanh(H2F(icg) + H2F(hcg) + H2F(b1c) + H2F(b2c)); - og = sigmoid(H2F(iog) + H2F(hog) + H2F(b1o) + H2F(b2o)); - - f_cy = (fg * H2F(cx)) + (ig * cg); - f_hy = og * ::tanh(f_cy); - - *hy = F2H(f_hy); - *cy = F2H(f_cy); - - // SAVE FOR BACKWARDS - // Also need cy and cx but can be saved easily in python - *wig = F2H(ig); - *wfg = F2H(fg); - *wcg = F2H(cg); - *wog = F2H(og); - } -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ - void lstm_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, - const T* grad_cy_ptr, const T* cx_ptr, const T* cy_ptr, - const T* workspace_ptr, T* grad_gates_ptr, T* grad_cx_ptr) { - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; - - T ig = workspace_ptr[offset + 0 * hidden_size]; - T fg = workspace_ptr[offset + 1 * hidden_size]; - T cg = workspace_ptr[offset + 2 * hidden_size]; - T og = workspace_ptr[offset + 3 * hidden_size]; - - T* ih = &(grad_gates_ptr[offset + 0 * hidden_size]); - T* fh = &(grad_gates_ptr[offset + 1 * hidden_size]); - T* ch = &(grad_gates_ptr[offset + 2 * hidden_size]); - T* oh = &(grad_gates_ptr[offset + 3 * hidden_size]); - - // will return hidden grads here - T cx = cx_ptr[linearIndex]; - T cy = cy_ptr[linearIndex]; - - ACC_T go = H2F(grad_hy_ptr[linearIndex]); - ACC_T goc = H2F(grad_cy_ptr[linearIndex]); - - ACC_T gcx = ::tanh(H2F(cy)); - - ACC_T gog = go * gcx; - gcx = go * H2F(og) * (1 - gcx * gcx) + goc; - - ACC_T gig = gcx * H2F(cg); - ACC_T gfg = gcx * H2F(cx); - ACC_T gcg = gcx * H2F(ig); - - gig = gig * (1 - H2F(ig)) * H2F(ig); - gfg = gfg * (1 - H2F(fg)) * H2F(fg); - gcg = gcg * (1 - H2F(cg) * H2F(cg)); - 
gog = gog * (1 - H2F(og)) * H2F(og); - - *ih = F2H(gig); - *fh = F2H(gfg); - *ch = F2H(gcg); - *oh = F2H(gog); - - if (grad_cx_ptr != nullptr) { - gcx = gcx * H2F(fg); - T* gi = &(grad_cx_ptr[linearIndex]); - *gi = F2H(gcx); - } - } -} - -template -struct FusedLstmCellFunctor final { - void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, - const T* cx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, - T* cy_ptr, T* workspace_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, - static_cast(cx_numel), static_cast(hidden_size), - input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, hidden_bias_ptr, - hy_ptr, cy_ptr, workspace_ptr); - } else { - RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, - hidden_size, input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, - hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); - } - } -}; - -template<> -void FusedLstmCellFunctor::operator()( - ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, - const float16* cx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, - float16* hy_ptr, float16* cy_ptr, float16* workspace_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL( - (lstm_cell_forward), stream, cx_numel, static_cast(cx_numel), - static_cast(hidden_size), reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), reinterpret_cast(cx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); - } else { - RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, - hidden_size, reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), - reinterpret_cast(cx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); - } -} - -template -struct FusedLstmCellGradFunctor final { - void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* grad_hy_ptr, const T* grad_cy_ptr, - const T* cx_ptr, const T* cy_ptr, const T* workspace_ptr, T* grad_gates_ptr, - T* grad_cx_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, - static_cast(cx_numel), static_cast(hidden_size), - grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, grad_gates_ptr, - grad_cx_ptr); - } else { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, - hidden_size, grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, - grad_gates_ptr, grad_cx_ptr); - } - } -}; - -template<> -void FusedLstmCellGradFunctor::operator()( - ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* grad_hy_ptr, const float16* grad_cy_ptr, - const float16* cx_ptr, const float16* cy_ptr, const float16* workspace_ptr, - float16* grad_gates_ptr, float16* grad_cx_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, - static_cast(cx_numel), 
static_cast(hidden_size), - reinterpret_cast(grad_hy_ptr), - reinterpret_cast(grad_cy_ptr), - reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), - reinterpret_cast(workspace_ptr), - reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); - } else { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, - hidden_size, reinterpret_cast(grad_hy_ptr), - reinterpret_cast(grad_cy_ptr), - reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), - reinterpret_cast(workspace_ptr), - reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); - } -} - -} // namespace - -template -class GpuFusedLstmCellKernel final : public user_op::OpKernel { - public: - GpuFusedLstmCellKernel() = default; - ~GpuFusedLstmCellKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); - const user_op::Tensor* hidden_gates = ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); - const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); - user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); - user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); - user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - - const T* input_bias_ptr = nullptr; - const T* hidden_bias_ptr = nullptr; - if (ctx->has_input("input_bias", 0)) { - CHECK(ctx->has_input("hidden_bias", 0)); - input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); - hidden_bias_ptr = ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); - } - const T* input_gates_ptr = input_gates->dptr(); - const T* hidden_gates_ptr = hidden_gates->dptr(); - const T* cx_ptr = cx->dptr(); - - T* hy_ptr = hy->mut_dptr(); - T* cy_ptr = cy->mut_dptr(); - T* workspace_ptr = workspace->mut_dptr(); - const int64_t cx_numel = cx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); - FusedLstmCellFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, - input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, - hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_LSTM_CELL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_lstm_cell") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("cx", 0) == GetDataType::value) \ - && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ - && (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) - -REGISTER_FUSED_LSTM_CELL_KERNEL(float); -REGISTER_FUSED_LSTM_CELL_KERNEL(float16); - -class GpuFusedLstmCellGradFloatKernel final : public user_op::OpKernel { - public: - GpuFusedLstmCellGradFloatKernel() = default; - ~GpuFusedLstmCellGradFloatKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); - const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); - const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_gates = 
ctx->Tensor4ArgNameAndIndex("grad_gates", 0); - user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); - - const float* grad_hy_ptr = grad_hy->dptr(); - const float* grad_cy_ptr = grad_cy->dptr(); - const float* cx_ptr = cx->dptr(); - const float* cy_ptr = cy->dptr(); - const float* workspace_ptr = workspace->dptr(); - - float* grad_gates_ptr = grad_gates->mut_dptr(); - float* grad_cx_ptr = nullptr; - - if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } - - const int64_t cx_numel = cx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); - FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, - grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, - grad_gates_ptr, grad_cx_ptr); - - if (ctx->has_output("grad_bias", 0)) { - float* grad_bias_ptr = ctx->Tensor4ArgNameAndIndex("grad_bias", 0)->mut_dptr(); - std::vector axis; - axis.push_back(0); - const Shape& reduced_shape = - CreateReducedShape(workspace->shape_view(), {axis.begin(), axis.end()}); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, grad_bias_ptr), - XpuVarNdarray(grad_gates->shape_view(), grad_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_lstm_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) - && (user_op::HobDataType("cx", 0) == GetDataType::value) - && (user_op::HobDataType("cy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); - tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - -class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { - public: - GpuFusedLstmCellGradHalfKernel() = default; - ~GpuFusedLstmCellGradHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); - const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); - const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_gates = ctx->Tensor4ArgNameAndIndex("grad_gates", 0); - user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); - - const float16* grad_hy_ptr = grad_hy->dptr(); - const float16* grad_cy_ptr = grad_cy->dptr(); - const float16* cx_ptr = cx->dptr(); - const float16* cy_ptr = cy->dptr(); - const float16* workspace_ptr = workspace->dptr(); - - float16* grad_gates_ptr = grad_gates->mut_dptr(); - float16* grad_cx_ptr = nullptr; - - if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } - - const int64_t cx_numel = 
cx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); - FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, - grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, - grad_gates_ptr, grad_cx_ptr); - - if (ctx->has_output("grad_bias", 0)) { - std::vector axis; - axis.push_back(0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = grad_gates->shape_view(); - const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); - float* in_tmp_buffer = tmp_buffer->mut_dptr(); - const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - float* out_tmp_buffer = - reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); - const size_t out_tmp_buffer_bytes = - GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); - float* reduce_tmp_buffer = reinterpret_cast( - tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); - const size_t reduce_tmp_buffer_bytes = - GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape_view().elem_cnt()); - auto h2f = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat16, DataType::kFloat); - CHECK(h2f); - auto f2h = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat, DataType::kFloat16); - CHECK(f2h); - h2f->Launch(ctx->stream(), grad_gates->dptr(), in_tmp_buffer, in_shape.elem_cnt()); - - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), - XpuVarNdarray(in_shape, in_tmp_buffer), - XpuVarNdarray(in_shape, reduce_tmp_buffer)); - - user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_bias", 0); - f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape_view().elem_cnt()); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_lstm_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) - && (user_op::HobDataType("cx", 0) == GetDataType::value) - && (user_op::HobDataType("cy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); - const Shape& out_shape = ctx->OutputTensorDesc("grad_bias", 0)->shape(); - tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)) - + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
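For reference, the lstm_cell_forward and lstm_cell_backward kernels in this file implement the standard LSTM cell update and its gradient per output element. The following is a minimal host-side C++ sketch of that math under the gate layout [i, f, g, o] used by the kernels' offset arithmetic; the function names and the float-only signatures are illustrative and not part of the patch.

#include <cmath>
#include <cstdint>

// Forward for one (row, col) element; input_gates/hidden_gates/workspace point to the
// 4*hidden_size slice of the current row. Workspace stores the post-activation gates
// so the backward pass can reuse them.
inline void lstm_cell_forward_ref(const float* input_gates, const float* hidden_gates,
                                  const float* input_bias, const float* hidden_bias,
                                  float cx, int64_t col, int64_t hidden_size,
                                  float* hy, float* cy, float* workspace) {
  auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
  auto gate = [&](int g) {
    return input_gates[g * hidden_size + col] + hidden_gates[g * hidden_size + col]
           + (input_bias ? input_bias[g * hidden_size + col] : 0.0f)
           + (hidden_bias ? hidden_bias[g * hidden_size + col] : 0.0f);
  };
  const float ig = sigmoid(gate(0));    // input gate
  const float fg = sigmoid(gate(1));    // forget gate
  const float cg = std::tanh(gate(2));  // cell candidate
  const float og = sigmoid(gate(3));    // output gate
  *cy = fg * cx + ig * cg;
  *hy = og * std::tanh(*cy);
  workspace[0 * hidden_size + col] = ig;
  workspace[1 * hidden_size + col] = fg;
  workspace[2 * hidden_size + col] = cg;
  workspace[3 * hidden_size + col] = og;
}

// Backward for the same element: gradients of the pre-activation gates and of cx.
// ig/fg/cg/og are the saved post-activation gates from the forward workspace.
inline void lstm_cell_backward_ref(float grad_hy, float grad_cy, float cx, float cy,
                                   float ig, float fg, float cg, float og,
                                   float* gig, float* gfg, float* gcg, float* gog,
                                   float* grad_cx /* may be nullptr */) {
  const float tanh_cy = std::tanh(cy);
  *gog = grad_hy * tanh_cy * (1.0f - og) * og;
  const float gcx = grad_hy * og * (1.0f - tanh_cy * tanh_cy) + grad_cy;
  *gig = gcx * cg * (1.0f - ig) * ig;
  *gfg = gcx * cx * (1.0f - fg) * fg;
  *gcg = gcx * ig * (1.0f - cg * cg);
  if (grad_cx != nullptr) { *grad_cx = gcx * fg; }
}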
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/ep/rocm/cuda_device.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" + +// NOTE(Liang Depeng): The implementation of fused_lstm_cell is modified from +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu + +namespace oneflow { + +namespace { + +template +struct AccumulateType {}; +template<> +struct AccumulateType { + using type = float; +}; +template<> +struct AccumulateType { + using type = double; +}; + +template +using acc_type = typename AccumulateType::type; + +#define H2F(input) static_cast(input) +#define F2H(input) static_cast(input) + +template +__device__ __forceinline__ T sigmoid(T in) { + T one = static_cast(1.0); + return one / (one + ::exp(-in)); +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ + void lstm_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, + const T* input_gates_ptr, const T* hidden_gates_ptr, const T* cx_ptr, + const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, T* cy_ptr, + T* workspace_ptr) { + bool has_bias = input_bias_ptr != nullptr; + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; + + T iig = input_gates_ptr[offset + 0 * hidden_size]; + T ifg = input_gates_ptr[offset + 1 * hidden_size]; + T icg = input_gates_ptr[offset + 2 * hidden_size]; + T iog = input_gates_ptr[offset + 3 * hidden_size]; + + T hig = hidden_gates_ptr[offset + 0 * hidden_size]; + T hfg = hidden_gates_ptr[offset + 1 * hidden_size]; + T hcg = hidden_gates_ptr[offset + 2 * hidden_size]; + T hog = hidden_gates_ptr[offset + 3 * hidden_size]; + + T* wig = &(workspace_ptr[offset + 0 * hidden_size]); + T* wfg = &(workspace_ptr[offset + 1 * hidden_size]); + T* wcg = &(workspace_ptr[offset + 2 * hidden_size]); + T* wog = &(workspace_ptr[offset + 3 * hidden_size]); + + T cx = cx_ptr[linearIndex]; + + T* hy = &(hy_ptr[linearIndex]); + T* cy = &(cy_ptr[linearIndex]); + + T b1i, b1f, b1c, b1o; + T b2i, b2f, b2c, b2o; + + if (has_bias) { + b1i = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b1f = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b1c = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + b1o = input_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; + + b2i = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b2f = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b2c = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + b2o = hidden_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; + } else { + b1i = F2H(0.0); + b1f = F2H(0.0); + b1c = F2H(0.0); + b1o = F2H(0.0); + b2i = F2H(0.0); + b2f = F2H(0.0); + b2c = F2H(0.0); + b2o = F2H(0.0); + } + + ACC_T ig, fg, cg, og; + ACC_T f_hy, f_cy; + + ig = 
sigmoid(H2F(iig) + H2F(hig) + H2F(b1i) + H2F(b2i)); + fg = sigmoid(H2F(ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f)); + cg = ::tanh(H2F(icg) + H2F(hcg) + H2F(b1c) + H2F(b2c)); + og = sigmoid(H2F(iog) + H2F(hog) + H2F(b1o) + H2F(b2o)); + + f_cy = (fg * H2F(cx)) + (ig * cg); + f_hy = og * ::tanh(f_cy); + + *hy = F2H(f_hy); + *cy = F2H(f_cy); + + // SAVE FOR BACKWARDS + // Also need cy and cx but can be saved easily in python + *wig = F2H(ig); + *wfg = F2H(fg); + *wcg = F2H(cg); + *wog = F2H(og); + } +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ + void lstm_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, + const T* grad_cy_ptr, const T* cx_ptr, const T* cy_ptr, + const T* workspace_ptr, T* grad_gates_ptr, T* grad_cx_ptr) { + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; + + T ig = workspace_ptr[offset + 0 * hidden_size]; + T fg = workspace_ptr[offset + 1 * hidden_size]; + T cg = workspace_ptr[offset + 2 * hidden_size]; + T og = workspace_ptr[offset + 3 * hidden_size]; + + T* ih = &(grad_gates_ptr[offset + 0 * hidden_size]); + T* fh = &(grad_gates_ptr[offset + 1 * hidden_size]); + T* ch = &(grad_gates_ptr[offset + 2 * hidden_size]); + T* oh = &(grad_gates_ptr[offset + 3 * hidden_size]); + + // will return hidden grads here + T cx = cx_ptr[linearIndex]; + T cy = cy_ptr[linearIndex]; + + ACC_T go = H2F(grad_hy_ptr[linearIndex]); + ACC_T goc = H2F(grad_cy_ptr[linearIndex]); + + ACC_T gcx = ::tanh(H2F(cy)); + + ACC_T gog = go * gcx; + gcx = go * H2F(og) * (1 - gcx * gcx) + goc; + + ACC_T gig = gcx * H2F(cg); + ACC_T gfg = gcx * H2F(cx); + ACC_T gcg = gcx * H2F(ig); + + gig = gig * (1 - H2F(ig)) * H2F(ig); + gfg = gfg * (1 - H2F(fg)) * H2F(fg); + gcg = gcg * (1 - H2F(cg) * H2F(cg)); + gog = gog * (1 - H2F(og)) * H2F(og); + + *ih = F2H(gig); + *fh = F2H(gfg); + *ch = F2H(gcg); + *oh = F2H(gog); + + if (grad_cx_ptr != nullptr) { + gcx = gcx * H2F(fg); + T* gi = &(grad_cx_ptr[linearIndex]); + *gi = F2H(gcx); + } + } +} + +template +struct FusedLstmCellFunctor final { + void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, + const T* cx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, + T* cy_ptr, T* workspace_ptr) { + using ACC_T = acc_type; + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, + static_cast(cx_numel), static_cast(hidden_size), + input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, hidden_bias_ptr, + hy_ptr, cy_ptr, workspace_ptr); + } else { + RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, + hidden_size, input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, + hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); + } + } +}; + +template<> +void FusedLstmCellFunctor::operator()( + ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, + const float16* cx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, + float16* hy_ptr, float16* cy_ptr, float16* workspace_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL( + (lstm_cell_forward), stream, cx_numel, 
static_cast(cx_numel), + static_cast(hidden_size), reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), reinterpret_cast(cx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); + } else { + RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, + hidden_size, reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), + reinterpret_cast(cx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); + } +} + +template +struct FusedLstmCellGradFunctor final { + void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* grad_hy_ptr, const T* grad_cy_ptr, + const T* cx_ptr, const T* cy_ptr, const T* workspace_ptr, T* grad_gates_ptr, + T* grad_cx_ptr) { + using ACC_T = acc_type; + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, + static_cast(cx_numel), static_cast(hidden_size), + grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, grad_gates_ptr, + grad_cx_ptr); + } else { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, + hidden_size, grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, + grad_gates_ptr, grad_cx_ptr); + } + } +}; + +template<> +void FusedLstmCellGradFunctor::operator()( + ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* grad_hy_ptr, const float16* grad_cy_ptr, + const float16* cx_ptr, const float16* cy_ptr, const float16* workspace_ptr, + float16* grad_gates_ptr, float16* grad_cx_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, + static_cast(cx_numel), static_cast(hidden_size), + reinterpret_cast(grad_hy_ptr), + reinterpret_cast(grad_cy_ptr), + reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), + reinterpret_cast(workspace_ptr), + reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); + } else { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, + hidden_size, reinterpret_cast(grad_hy_ptr), + reinterpret_cast(grad_cy_ptr), + reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), + reinterpret_cast(workspace_ptr), + reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); + } +} + +} // namespace + +template +class GpuFusedLstmCellKernel final : public user_op::OpKernel { + public: + GpuFusedLstmCellKernel() = default; + ~GpuFusedLstmCellKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); + const user_op::Tensor* hidden_gates = ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); + const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); + user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); + user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); + user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + + const T* input_bias_ptr = nullptr; + const T* hidden_bias_ptr = nullptr; + if (ctx->has_input("input_bias", 0)) { + CHECK(ctx->has_input("hidden_bias", 0)); + input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); + hidden_bias_ptr = 
ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); + } + const T* input_gates_ptr = input_gates->dptr(); + const T* hidden_gates_ptr = hidden_gates->dptr(); + const T* cx_ptr = cx->dptr(); + + T* hy_ptr = hy->mut_dptr(); + T* cy_ptr = cy->mut_dptr(); + T* workspace_ptr = workspace->mut_dptr(); + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); + FusedLstmCellFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, + input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, + hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_LSTM_CELL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_lstm_cell") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("cx", 0) == GetDataType::value) \ + && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ + && (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) + +REGISTER_FUSED_LSTM_CELL_KERNEL(float); +REGISTER_FUSED_LSTM_CELL_KERNEL(float16); + +class GpuFusedLstmCellGradFloatKernel final : public user_op::OpKernel { + public: + GpuFusedLstmCellGradFloatKernel() = default; + ~GpuFusedLstmCellGradFloatKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); + const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); + const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_gates = ctx->Tensor4ArgNameAndIndex("grad_gates", 0); + user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); + + const float* grad_hy_ptr = grad_hy->dptr(); + const float* grad_cy_ptr = grad_cy->dptr(); + const float* cx_ptr = cx->dptr(); + const float* cy_ptr = cy->dptr(); + const float* workspace_ptr = workspace->dptr(); + + float* grad_gates_ptr = grad_gates->mut_dptr(); + float* grad_cx_ptr = nullptr; + + if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } + + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); + FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, + grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, + grad_gates_ptr, grad_cx_ptr); + + if (ctx->has_output("grad_bias", 0)) { + float* grad_bias_ptr = ctx->Tensor4ArgNameAndIndex("grad_bias", 0)->mut_dptr(); + std::vector axis; + axis.push_back(0); + const Shape& reduced_shape = + CreateReducedShape(workspace->shape_view(), {axis.begin(), axis.end()}); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, grad_bias_ptr), + XpuVarNdarray(grad_gates->shape_view(), grad_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("fused_lstm_cell_grad") + 
.SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) + && (user_op::HobDataType("cx", 0) == GetDataType::value) + && (user_op::HobDataType("cy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); + tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + +class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { + public: + GpuFusedLstmCellGradHalfKernel() = default; + ~GpuFusedLstmCellGradHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); + const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); + const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_gates = ctx->Tensor4ArgNameAndIndex("grad_gates", 0); + user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); + + const float16* grad_hy_ptr = grad_hy->dptr(); + const float16* grad_cy_ptr = grad_cy->dptr(); + const float16* cx_ptr = cx->dptr(); + const float16* cy_ptr = cy->dptr(); + const float16* workspace_ptr = workspace->dptr(); + + float16* grad_gates_ptr = grad_gates->mut_dptr(); + float16* grad_cx_ptr = nullptr; + + if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } + + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); + FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, + grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, + grad_gates_ptr, grad_cx_ptr); + + if (ctx->has_output("grad_bias", 0)) { + std::vector axis; + axis.push_back(0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const ShapeView& in_shape = grad_gates->shape_view(); + const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); + float* in_tmp_buffer = tmp_buffer->mut_dptr(); + const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + float* out_tmp_buffer = + reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); + const size_t out_tmp_buffer_bytes = + GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); + float* reduce_tmp_buffer = reinterpret_cast( + tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); + const size_t reduce_tmp_buffer_bytes = + GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, + tmp_buffer->shape_view().elem_cnt()); + auto h2f = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat16, DataType::kFloat); + CHECK(h2f); + auto f2h = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat, DataType::kFloat16); + CHECK(f2h); + 
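The grad_bias path at this point casts the half-precision grad_gates to a float scratch buffer, reduces over the leading (batch) axis, and casts the reduced row back to half, as the following statements show. Below is a minimal reference of the equivalent reduction with float accumulation; the helper name and the use of float storage for the half input are illustrative simplifications.

#include <cstdint>
#include <vector>

// grad_gates: [rows, gate_cols], modelled as float here for clarity (the kernel casts
// half -> float first); grad_bias: [gate_cols]. Accumulation happens in float, mirroring
// the h2f -> reduce -> f2h flow in the kernel.
std::vector<float> ReduceGradBiasRef(const std::vector<float>& grad_gates,
                                     int64_t rows, int64_t gate_cols) {
  std::vector<float> grad_bias(gate_cols, 0.0f);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < gate_cols; ++c) {
      grad_bias[c] += grad_gates[r * gate_cols + c];  // sum over axis 0
    }
  }
  return grad_bias;  // the kernel then casts this back to half with the f2h primitive
}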
h2f->Launch(ctx->stream(), grad_gates->dptr(), in_tmp_buffer, in_shape.elem_cnt()); + + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), + XpuVarNdarray(in_shape, in_tmp_buffer), + XpuVarNdarray(in_shape, reduce_tmp_buffer)); + + user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_bias", 0); + f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), + output_tensor->shape_view().elem_cnt()); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("fused_lstm_cell_grad") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) + && (user_op::HobDataType("cx", 0) == GetDataType::value) + && (user_op::HobDataType("cy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); + const Shape& out_shape = ctx->OutputTensorDesc("grad_bias", 0)->shape(); + tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)) + + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp index e1f6dc6..b7f1bd0 100644 --- a/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp @@ -1,146 +1,146 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
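The InferTmpSizeFn registered above reserves two float-sized copies of the workspace (the cast input plus the reduce scratch) and one float-sized copy of grad_bias. A small sketch of that arithmetic follows; GetCudaAlignedSize is modelled here as rounding up to an assumed 512-byte alignment, and the function names are illustrative.

#include <cstddef>
#include <cstdint>

// Illustrative stand-in for GetCudaAlignedSize: round a byte count up to the device
// alignment (assumed 512 bytes here).
constexpr size_t AlignedSize(size_t bytes, size_t align = 512) {
  return (bytes + align - 1) / align * align;
}

// tmp_buffer layout used by the half grad kernel when grad_bias is requested:
//   [ grad_gates cast to float | reduced output in float | reduce scratch ]
size_t InferLstmGradTmpBytes(int64_t workspace_elem_cnt, int64_t grad_bias_elem_cnt) {
  const size_t in_bytes = AlignedSize(workspace_elem_cnt * sizeof(float));   // h2f output
  const size_t out_bytes = AlignedSize(grad_bias_elem_cnt * sizeof(float));  // reduce result
  const size_t reduce_bytes = in_bytes;                                      // reduce scratch
  return in_bytes + out_bytes + reduce_bytes;  // equals 2 * aligned(in) + aligned(out)
}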
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/hip/elementwise.hip.h" - -namespace oneflow { - -namespace { - -constexpr int32_t kWarpSize = 64; - -template -__global__ void VectorizedReluDropoutBitmaskBackwardKernel( - const IndexType elem_cnt, const IndexType cols, const IndexType aux_ld, const float scale, - const IndexType n_tail, const IndexType tail_offset, const T* dy, const int32_t* mask, T* dx) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadStoreType = cuda::elementwise::PackType; - using LoadStorePack = cuda::elementwise::Pack; - - T t_scale = static_cast(scale); - for (IndexType linear_pack_index = global_thread_id * pack_size; linear_pack_index < elem_cnt; - linear_pack_index += gridDim.x * blockDim.x * pack_size) { - const LoadStoreType* dy_load = reinterpret_cast(dy + linear_pack_index); - LoadStorePack dy_vec; - dy_vec.storage = *dy_load; - - LoadStorePack dx_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - const IndexType linear_index = (linear_pack_index + i); - const IndexType row = linear_index / cols; - const IndexType col = linear_index - row * cols; - const int32_t col_mod_warpsize = col % kWarpSize; - const IndexType aux_idx = ((row * aux_ld) + col) / kWarpSize; - bool is_positive = mask[aux_idx] & (1 << col_mod_warpsize); - dx_vec.elem[i] = - dy_vec.elem[i] * static_cast(static_cast(is_positive)) * static_cast(scale); - } - *(reinterpret_cast(dx + linear_pack_index)) = dx_vec.storage; - } - - if (tail && global_thread_id < n_tail) { - const IndexType tail_index = tail_offset + global_thread_id; - const IndexType tail_row = tail_index / cols; - const IndexType tail_col = tail_index - tail_row * cols; - const IndexType tail_col_mod_warpsize = tail_col % kWarpSize; - const IndexType tail_aux_idx = ((tail_row * aux_ld) + tail_col) / kWarpSize; - bool is_positive = mask[tail_aux_idx] & (1 << tail_col_mod_warpsize); - dx[tail_index] = - dy[tail_index] * static_cast(static_cast(is_positive)) * static_cast(scale); - } -} - -template -void LaunchVectorizedReluDropoutBackwardKernel(ep::Stream* stream, const int64_t elem_cnt, - const int64_t cols, const int64_t aux_ld, - float scale, const T* dy, const int32_t* mask, - T* dx) { - constexpr int pack_size = cuda::elementwise::PackSize(); - const int64_t pack_num = elem_cnt / pack_size; - const int64_t tail_offset = pack_num * pack_size; - const int64_t n_tail = elem_cnt - tail_offset; - const bool tail = n_tail > 0 ? 
true : false; - if (tail) { - if (elem_cnt < GetMaxVal()) { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, - mask, dx); - } else { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, - mask, dx); - } - } else { - if (elem_cnt < GetMaxVal()) { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, - dy, mask, dx); - } else { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, - dy, mask, dx); - } - } -} - -template -class FusedReluDropoutGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedReluDropoutGradKernel() = default; - ~FusedReluDropoutGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float scale = ctx->Attr("scale"); - - const int64_t cols = dy->shape_view().At(1); - const int64_t aux_ld = mask->shape_view().At(1) * 32; - const int64_t elem_cnt = dy->shape_view().elem_cnt(); - LaunchVectorizedReluDropoutBackwardKernel( - ctx->stream(), elem_cnt, cols, aux_ld, scale, reinterpret_cast(dy->dptr()), - mask->dptr(), reinterpret_cast(dx->mut_dptr())); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ - REGISTER_USER_KERNEL("fused_relu_dropout_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == data_type)); - -REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat) -REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16) - - -} // namespace - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
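The backward kernel in this file reconstructs a keep/drop decision per column from a packed bitmask and scales dy accordingly. A scalar reference of that semantics follows; it mirrors the kernel's word and bit indexing but, to stay well defined, models the mask as 64-bit words matching kWarpSize (the kernel packs the bits into int32 storage), so treat it as a sketch rather than the exact storage format.

#include <cstdint>
#include <vector>

// Relu-dropout backward reference: dx = dy * scale where the keep bit is set, 0 elsewhere.
// aux_ld is the padded row stride of the bitmask, with aux_ld >= cols.
void ReluDropoutGradRef(const std::vector<float>& dy, const std::vector<uint64_t>& mask_bits,
                        int64_t rows, int64_t cols, int64_t aux_ld, float scale,
                        std::vector<float>* dx) {
  constexpr int64_t kWarpSize = 64;
  dx->assign(dy.size(), 0.0f);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      const int64_t word = (r * aux_ld + c) / kWarpSize;  // same word index as the kernel
      const bool kept = (mask_bits[word] >> (c % kWarpSize)) & 1u;
      (*dx)[r * cols + c] = kept ? dy[r * cols + c] * scale : 0.0f;
    }
  }
}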
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/hip/elementwise.hip.h" + +namespace oneflow { + +namespace { + +constexpr int32_t kWarpSize = 64; + +template +__global__ void VectorizedReluDropoutBitmaskBackwardKernel( + const IndexType elem_cnt, const IndexType cols, const IndexType aux_ld, const float scale, + const IndexType n_tail, const IndexType tail_offset, const T* dy, const int32_t* mask, T* dx) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + using LoadStoreType = cuda::elementwise::PackType; + using LoadStorePack = cuda::elementwise::Pack; + + T t_scale = static_cast(scale); + for (IndexType linear_pack_index = global_thread_id * pack_size; linear_pack_index < elem_cnt; + linear_pack_index += gridDim.x * blockDim.x * pack_size) { + const LoadStoreType* dy_load = reinterpret_cast(dy + linear_pack_index); + LoadStorePack dy_vec; + dy_vec.storage = *dy_load; + + LoadStorePack dx_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + const IndexType linear_index = (linear_pack_index + i); + const IndexType row = linear_index / cols; + const IndexType col = linear_index - row * cols; + const int32_t col_mod_warpsize = col % kWarpSize; + const IndexType aux_idx = ((row * aux_ld) + col) / kWarpSize; + bool is_positive = mask[aux_idx] & (1 << col_mod_warpsize); + dx_vec.elem[i] = + dy_vec.elem[i] * static_cast(static_cast(is_positive)) * static_cast(scale); + } + *(reinterpret_cast(dx + linear_pack_index)) = dx_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const IndexType tail_index = tail_offset + global_thread_id; + const IndexType tail_row = tail_index / cols; + const IndexType tail_col = tail_index - tail_row * cols; + const IndexType tail_col_mod_warpsize = tail_col % kWarpSize; + const IndexType tail_aux_idx = ((tail_row * aux_ld) + tail_col) / kWarpSize; + bool is_positive = mask[tail_aux_idx] & (1 << tail_col_mod_warpsize); + dx[tail_index] = + dy[tail_index] * static_cast(static_cast(is_positive)) * static_cast(scale); + } +} + +template +void LaunchVectorizedReluDropoutBackwardKernel(ep::Stream* stream, const int64_t elem_cnt, + const int64_t cols, const int64_t aux_ld, + float scale, const T* dy, const int32_t* mask, + T* dx) { + constexpr int pack_size = cuda::elementwise::PackSize(); + const int64_t pack_num = elem_cnt / pack_size; + const int64_t tail_offset = pack_num * pack_size; + const int64_t n_tail = elem_cnt - tail_offset; + const bool tail = n_tail > 0 ? 
true : false; + if (tail) { + if (elem_cnt < GetMaxVal()) { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, + mask, dx); + } else { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, + mask, dx); + } + } else { + if (elem_cnt < GetMaxVal()) { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, + dy, mask, dx); + } else { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, + dy, mask, dx); + } + } +} + +template +class FusedReluDropoutGradKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedReluDropoutGradKernel() = default; + ~FusedReluDropoutGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float scale = ctx->Attr("scale"); + + const int64_t cols = dy->shape_view().At(1); + const int64_t aux_ld = mask->shape_view().At(1) * 32; + const int64_t elem_cnt = dy->shape_view().elem_cnt(); + LaunchVectorizedReluDropoutBackwardKernel( + ctx->stream(), elem_cnt, cols, aux_ld, scale, reinterpret_cast(dy->dptr()), + mask->dptr(), reinterpret_cast(dx->mut_dptr())); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("fused_relu_dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == data_type)); + +REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat) +REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16) + + +} // namespace + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp b/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp index 8813fb7..2a5e18c 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp +++ b/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp @@ -1,236 +1,236 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
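The launcher above splits elem_cnt into a vectorized body handled pack_size elements at a time and a scalar tail handled by the same kernel when n_tail > 0. A small self-contained sketch of that split; the pack size of 4 and the element count are example values only.

#include <cstdint>
#include <cstdio>

// How an element count is split for a vectorized kernel: full packs first, scalar tail last.
struct PackSplit {
  int64_t pack_num;     // number of full packs
  int64_t tail_offset;  // first element handled by the tail path
  int64_t n_tail;       // number of tail elements
};

constexpr PackSplit SplitForPack(int64_t elem_cnt, int64_t pack_size) {
  return {elem_cnt / pack_size, (elem_cnt / pack_size) * pack_size,
          elem_cnt - (elem_cnt / pack_size) * pack_size};
}

int main() {
  const PackSplit s = SplitForPack(/*elem_cnt=*/1030, /*pack_size=*/4);
  // 257 packs cover elements [0, 1028); the remaining 2 elements go through the tail path.
  std::printf("packs=%lld tail_offset=%lld n_tail=%lld\n",
              (long long)s.pack_num, (long long)s.tail_offset, (long long)s.n_tail);
  return 0;
}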
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" -namespace oneflow { - -namespace { - -template -void LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, - const int64_t elem_cnt, const int64_t rows, const int64_t cols, - const float fill, const float scale, const int64_t* input_dims, - const int64_t* mask_dims) { - NdIndexOffsetHelper input_index_helper(input_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::BroadcastScaleMaskLoad load( - x, mask, params); - cuda::softmax::DirectStore store(y, cols); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, - const int64_t rows, const int64_t cols, const float fill, - const float scale) { - oneflow::fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); - cuda::softmax::DirectStore store(y, cols); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, - const MASK* mask, const int64_t elem_cnt, const int64_t rows, - const int64_t cols, const float fill, const float scale, - const int64_t* input_dims, const int64_t* mask_dims) { - NdIndexOffsetHelper input_index_helper(input_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_y(y, cols); - cuda::softmax::DirectLoad load_dy(dy, cols); - fused_scale_mask_softmax::BroadcastScaleMaskStore store( - dx, mask, params); - OF_CUDA_CHECK(( - cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); -} - -template -void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, - const MASK* mask, const int64_t rows, const int64_t cols, - const float fill, const float scale) { - fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_y(y, cols); - cuda::softmax::DirectLoad load_dy(dy, cols); - fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); - OF_CUDA_CHECK(( - cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); -} - -constexpr int32_t kMaxNumDims = 5; - -template -class FusedScaleMaskSoftmaxKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxKernel() = default; - ~FusedScaleMaskSoftmaxKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = 
ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const float mask_fill_value = ctx->Attr("mask_fill_value"); - const float scale_value = ctx->Attr("scale_value"); - const ShapeView& x_shape = x->shape_view(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(x_shape.NumAxes(), 2); - const int64_t elem_cnt = x_shape.elem_cnt(); - const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); - const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); - const size_t num_input_dims = x_shape.NumAxes(); - const int64_t* input_dims = x_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseForwardKernel( - ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), - mask->dptr(), rows, cols, mask_fill_value, scale_value); - } -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastForwardKernel( \ - ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ - mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ - simplified_input_dims, simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class FusedScaleMaskSoftmaxGradKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxGradKernel() = default; - ~FusedScaleMaskSoftmaxGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float scale_value = ctx->Attr("scale_value"); - const float mask_fill_value = static_cast(0.0); - const ShapeView& dy_shape = dy->shape_view(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(dy_shape.NumAxes(), 2); - const int64_t elem_cnt = dy_shape.elem_cnt(); - const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); - const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); - const int64_t* input_dims = dy_shape.ptr(); - const size_t num_input_dims = dy_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseBackwardKernel( - ctx->stream()->As()->cuda_stream(), y->dptr(), 
dy->dptr(), - dx->mut_dptr(), mask->dptr(), rows, cols, mask_fill_value, scale_value); - } -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastBackwardKernel( \ - ctx->stream()->As()->cuda_stream(), y->dptr(), dy->dptr(), \ - dx->mut_dptr(), mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ - simplified_input_dims, simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(dtype, mask_dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(dtype, mask_dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
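The forward kernel re-added below computes, per row, y = softmax(mask ? x * scale : fill), with the softmax evaluated in a numerically stable way. A compact single-row reference in plain C++ follows; the function name and the use of float throughout are illustrative.

#include <algorithm>
#include <cmath>
#include <vector>

// Reference for one row of fused_scale_mask_softmax:
//   y = softmax(mask ? x * scale : fill)
std::vector<float> ScaleMaskSoftmaxRowRef(const std::vector<float>& x,
                                          const std::vector<bool>& mask,
                                          float scale, float fill) {
  std::vector<float> y(x.size());
  float row_max = -INFINITY;
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = mask[i] ? x[i] * scale : fill;
    row_max = std::max(row_max, y[i]);
  }
  float sum = 0.0f;
  for (float& v : y) {
    v = std::exp(v - row_max);  // subtract the row max to avoid overflow
    sum += v;
  }
  for (float& v : y) { v /= sum; }
  return y;
}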
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" +namespace oneflow { + +namespace { + +template +void LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const int64_t* input_dims, + const int64_t* mask_dims) { + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::BroadcastScaleMaskLoad load( + x, mask, params); + cuda::softmax::DirectStore store(y, cols); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, + const int64_t rows, const int64_t cols, const float fill, + const float scale) { + oneflow::fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); + cuda::softmax::DirectStore store(y, cols); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, + const MASK* mask, const int64_t elem_cnt, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const int64_t* input_dims, const int64_t* mask_dims) { + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + fused_scale_mask_softmax::BroadcastScaleMaskStore store( + dx, mask, params); + OF_CUDA_CHECK(( + cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); +} + +template +void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, + const MASK* mask, const int64_t rows, const int64_t cols, + const float fill, const float scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); + OF_CUDA_CHECK(( + cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); +} + +constexpr int32_t kMaxNumDims = 5; + +template +class FusedScaleMaskSoftmaxKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxKernel() = default; + ~FusedScaleMaskSoftmaxKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = 
ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const float mask_fill_value = ctx->Attr("mask_fill_value"); + const float scale_value = ctx->Attr("scale_value"); + const ShapeView& x_shape = x->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(x_shape.NumAxes(), 2); + const int64_t elem_cnt = x_shape.elem_cnt(); + const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); + const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const size_t num_input_dims = x_shape.NumAxes(); + const int64_t* input_dims = x_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseForwardKernel( + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), + mask->dptr(), rows, cols, mask_fill_value, scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastForwardKernel( \ + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ + mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ + simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class FusedScaleMaskSoftmaxGradKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxGradKernel() = default; + ~FusedScaleMaskSoftmaxGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float scale_value = ctx->Attr("scale_value"); + const float mask_fill_value = static_cast(0.0); + const ShapeView& dy_shape = dy->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(dy_shape.NumAxes(), 2); + const int64_t elem_cnt = dy_shape.elem_cnt(); + const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); + const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t* input_dims = dy_shape.ptr(); + const size_t num_input_dims = dy_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseBackwardKernel( + ctx->stream()->As()->cuda_stream(), y->dptr(), 
dy->dptr(), + dx->mut_dptr(), mask->dptr(), rows, cols, mask_fill_value, scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastBackwardKernel( \ + ctx->stream()->As()->cuda_stream(), y->dptr(), dy->dptr(), \ + dx->mut_dptr(), mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ + simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_scale_mask_softmax.hip.h b/oneflow/user/kernels/fused_scale_mask_softmax.hip.h index 84adfb6..43e49ce 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax.hip.h +++ b/oneflow/user/kernels/fused_scale_mask_softmax.hip.h @@ -1,216 +1,216 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/nd_index_offset_helper.h" - -namespace oneflow { - -namespace fused_scale_mask_softmax { - -namespace { - -void SimplifyBroadcastDims(size_t num_a_dims, const int64_t* a_dims, size_t num_b_dims, - const int64_t* b_dims, size_t* simplified_num_dims, - int64_t* simplified_a_dims, int64_t* simplified_b_dims) { - const size_t num_max_dims = std::max(num_a_dims, num_b_dims); - auto MakeGetDim = [num_max_dims](size_t num_dims, const int64_t* dims) { - const int64_t num_padding_dims = num_max_dims - num_dims; - return [num_padding_dims, dims](size_t index) { - return index < num_padding_dims ? 
1 : dims[index - num_padding_dims]; - }; - }; - auto GetADim = MakeGetDim(num_a_dims, a_dims); - auto GetBDim = MakeGetDim(num_b_dims, b_dims); - *simplified_num_dims = 0; - bool prev_broadcast_a = false; - bool prev_broadcast_b = false; - for (int64_t i = 0; i < num_max_dims; ++i) { - const int64_t a_dim = GetADim(i); - const int64_t b_dim = GetBDim(i); - const int64_t broadcast_dim = std::max(a_dim, b_dim); - CHECK_GT(broadcast_dim, 0); - const bool broadcast_a = (a_dim == 1); - const bool broadcast_b = (b_dim == 1); - CHECK((a_dim == broadcast_dim) || broadcast_a); - CHECK((b_dim == broadcast_dim) || broadcast_b); - if (broadcast_dim == 1) { - continue; - } else if (*simplified_num_dims != 0 - && (prev_broadcast_a == broadcast_a && prev_broadcast_b == broadcast_b)) { - simplified_a_dims[*simplified_num_dims - 1] *= a_dim; - simplified_b_dims[*simplified_num_dims - 1] *= b_dim; - } else { - simplified_a_dims[*simplified_num_dims] = a_dim; - simplified_b_dims[*simplified_num_dims] = b_dim; - *simplified_num_dims += 1; - prev_broadcast_a = broadcast_a; - prev_broadcast_b = broadcast_b; - } - } -} - -template -struct BroadcastMaskSoftmaxParams { - NdIndexOffsetHelper src_index_helper; - NdIndexOffsetHelper mask_index_helper; - const int64_t* mask_dims{}; - int64_t row_size; - float fill; - float scale; -}; - -struct ElementwiseMaskSoftmaxParams { - int64_t row_size; - float fill; - float scale; -}; - -template -struct BroadcastScaleMaskLoad { - BroadcastScaleMaskLoad(const SRC* src, const MASK* mask, - BroadcastMaskSoftmaxParams params) - : src(src), mask(mask), params(params) { - for (int i = 0; i < num_dims; i++) { mask_dims[i] = params.mask_dims[i]; } - } - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - cuda::softmax::Pack mask_pack; - const IndexType offset = row * params.row_size + col; - IndexType input_index[num_dims]; - IndexType mask_index[num_dims]; - params.src_index_helper.OffsetToNdIndex(offset, input_index); - for (int dim = 0; dim < num_dims; ++dim) { - if (mask_dims[dim] == 1) { - mask_index[dim] = 0; - } else { - mask_index[dim] = input_index[dim]; - } - } - const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); - pack.storage = *(reinterpret_cast*>(src) + offset / N); - mask_pack.storage = - *(reinterpret_cast*>(mask) + mask_offset / N); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - dst[i] = static_cast(params.fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(params.scale); - } - } - } - const SRC* src; - const MASK* mask; - int64_t mask_dims[num_dims]; - BroadcastMaskSoftmaxParams params; -}; - -template -struct ElementwiseScaleMaskLoad { - ElementwiseScaleMaskLoad(const SRC* src, const MASK* mask, ElementwiseMaskSoftmaxParams param) - : src(src), mask(mask), param(param) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * param.row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - dst[i] = static_cast(param.fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(param.scale); - } - } - } - const SRC* src; - const MASK* mask; - ElementwiseMaskSoftmaxParams param; -}; - -template -struct BroadcastScaleMaskStore { - BroadcastScaleMaskStore(DST* dst, 
const MASK* mask, - BroadcastMaskSoftmaxParams params) - : dst(dst), mask(mask), params(params) { - for (int i = 0; i < num_dims; ++i) { mask_dims[i] = params.mask_dims[i]; } - } - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - cuda::softmax::Pack mask_pack; - const IndexType offset = row * params.row_size + col; - IndexType input_index[num_dims]; - IndexType mask_index[num_dims]; - params.src_index_helper.OffsetToNdIndex(offset, input_index); - for (int dim = 0; dim < num_dims; ++dim) { - if (mask_dims[dim] == 1) { - mask_index[dim] = 0; - } else { - mask_index[dim] = input_index[dim]; - } - } - const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); - mask_pack.storage = - *(reinterpret_cast*>(mask) + mask_offset / N); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - pack.elem[i] = static_cast(params.fill); - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); - } - } - *(reinterpret_cast*>(dst) + offset / N) = pack.storage; - } - DST* dst; - const MASK* mask; - int64_t mask_dims[num_dims]; - BroadcastMaskSoftmaxParams params; -}; - -template -struct ElementwiseScaleMaskStore { - ElementwiseScaleMaskStore(DST* dst, const MASK* mask, ElementwiseMaskSoftmaxParams params) - : dst(dst), mask(mask), params(params) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * params.row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - pack.elem[i] = params.fill; - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); - } - } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - const MASK* mask; - ElementwiseMaskSoftmaxParams params; -}; - -} // namespace - -} // namespace fused_scale_mask_softmax - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/nd_index_offset_helper.h" + +namespace oneflow { + +namespace fused_scale_mask_softmax { + +namespace { + +void SimplifyBroadcastDims(size_t num_a_dims, const int64_t* a_dims, size_t num_b_dims, + const int64_t* b_dims, size_t* simplified_num_dims, + int64_t* simplified_a_dims, int64_t* simplified_b_dims) { + const size_t num_max_dims = std::max(num_a_dims, num_b_dims); + auto MakeGetDim = [num_max_dims](size_t num_dims, const int64_t* dims) { + const int64_t num_padding_dims = num_max_dims - num_dims; + return [num_padding_dims, dims](size_t index) { + return index < num_padding_dims ? 
1 : dims[index - num_padding_dims]; + }; + }; + auto GetADim = MakeGetDim(num_a_dims, a_dims); + auto GetBDim = MakeGetDim(num_b_dims, b_dims); + *simplified_num_dims = 0; + bool prev_broadcast_a = false; + bool prev_broadcast_b = false; + for (int64_t i = 0; i < num_max_dims; ++i) { + const int64_t a_dim = GetADim(i); + const int64_t b_dim = GetBDim(i); + const int64_t broadcast_dim = std::max(a_dim, b_dim); + CHECK_GT(broadcast_dim, 0); + const bool broadcast_a = (a_dim == 1); + const bool broadcast_b = (b_dim == 1); + CHECK((a_dim == broadcast_dim) || broadcast_a); + CHECK((b_dim == broadcast_dim) || broadcast_b); + if (broadcast_dim == 1) { + continue; + } else if (*simplified_num_dims != 0 + && (prev_broadcast_a == broadcast_a && prev_broadcast_b == broadcast_b)) { + simplified_a_dims[*simplified_num_dims - 1] *= a_dim; + simplified_b_dims[*simplified_num_dims - 1] *= b_dim; + } else { + simplified_a_dims[*simplified_num_dims] = a_dim; + simplified_b_dims[*simplified_num_dims] = b_dim; + *simplified_num_dims += 1; + prev_broadcast_a = broadcast_a; + prev_broadcast_b = broadcast_b; + } + } +} + +template +struct BroadcastMaskSoftmaxParams { + NdIndexOffsetHelper src_index_helper; + NdIndexOffsetHelper mask_index_helper; + const int64_t* mask_dims{}; + int64_t row_size; + float fill; + float scale; +}; + +struct ElementwiseMaskSoftmaxParams { + int64_t row_size; + float fill; + float scale; +}; + +template +struct BroadcastScaleMaskLoad { + BroadcastScaleMaskLoad(const SRC* src, const MASK* mask, + BroadcastMaskSoftmaxParams params) + : src(src), mask(mask), params(params) { + for (int i = 0; i < num_dims; i++) { mask_dims[i] = params.mask_dims[i]; } + } + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + cuda::softmax::Pack mask_pack; + const IndexType offset = row * params.row_size + col; + IndexType input_index[num_dims]; + IndexType mask_index[num_dims]; + params.src_index_helper.OffsetToNdIndex(offset, input_index); + for (int dim = 0; dim < num_dims; ++dim) { + if (mask_dims[dim] == 1) { + mask_index[dim] = 0; + } else { + mask_index[dim] = input_index[dim]; + } + } + const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); + pack.storage = *(reinterpret_cast*>(src) + offset / N); + mask_pack.storage = + *(reinterpret_cast*>(mask) + mask_offset / N); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + dst[i] = static_cast(params.fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(params.scale); + } + } + } + const SRC* src; + const MASK* mask; + int64_t mask_dims[num_dims]; + BroadcastMaskSoftmaxParams params; +}; + +template +struct ElementwiseScaleMaskLoad { + ElementwiseScaleMaskLoad(const SRC* src, const MASK* mask, ElementwiseMaskSoftmaxParams param) + : src(src), mask(mask), param(param) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * param.row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + dst[i] = static_cast(param.fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(param.scale); + } + } + } + const SRC* src; + const MASK* mask; + ElementwiseMaskSoftmaxParams param; +}; + +template +struct BroadcastScaleMaskStore { + BroadcastScaleMaskStore(DST* dst, 
const MASK* mask, + BroadcastMaskSoftmaxParams params) + : dst(dst), mask(mask), params(params) { + for (int i = 0; i < num_dims; ++i) { mask_dims[i] = params.mask_dims[i]; } + } + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + cuda::softmax::Pack mask_pack; + const IndexType offset = row * params.row_size + col; + IndexType input_index[num_dims]; + IndexType mask_index[num_dims]; + params.src_index_helper.OffsetToNdIndex(offset, input_index); + for (int dim = 0; dim < num_dims; ++dim) { + if (mask_dims[dim] == 1) { + mask_index[dim] = 0; + } else { + mask_index[dim] = input_index[dim]; + } + } + const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); + mask_pack.storage = + *(reinterpret_cast*>(mask) + mask_offset / N); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + pack.elem[i] = static_cast(params.fill); + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); + } + } + *(reinterpret_cast*>(dst) + offset / N) = pack.storage; + } + DST* dst; + const MASK* mask; + int64_t mask_dims[num_dims]; + BroadcastMaskSoftmaxParams params; +}; + +template +struct ElementwiseScaleMaskStore { + ElementwiseScaleMaskStore(DST* dst, const MASK* mask, ElementwiseMaskSoftmaxParams params) + : dst(dst), mask(mask), params(params) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * params.row_size + col) / N; + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + pack.elem[i] = params.fill; + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); + } + } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + const MASK* mask; + ElementwiseMaskSoftmaxParams params; +}; + +} // namespace + +} // namespace fused_scale_mask_softmax + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp b/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp index a0e6482..12a8fc9 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp +++ b/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp @@ -1,303 +1,303 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
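Before dispatching, both the plain and the dropout variants of this kernel fold adjacent dimensions that share the same broadcast pattern, so a 4-D attention mask typically collapses to two or three effective dimensions (or to one, which selects the elementwise path). The host-side sketch below mirrors that folding logic on hypothetical shapes; the sketch function name and the shapes are illustrative and not part of this patch.

// Host-side illustration of the broadcast-dim folding performed by
// fused_scale_mask_softmax::SimplifyBroadcastDims above. Shapes are hypothetical.
#include <algorithm>
#include <cstdint>
#include <cstdio>

void SimplifyBroadcastDimsSketch(size_t num_a, const int64_t* a, size_t num_b, const int64_t* b,
                                 size_t* num_out, int64_t* a_out, int64_t* b_out) {
  const size_t num_max = std::max(num_a, num_b);
  auto Get = [num_max](size_t n, const int64_t* d, size_t i) {
    const size_t pad = num_max - n;
    return i < pad ? int64_t{1} : d[i - pad];
  };
  *num_out = 0;
  bool prev_bc_a = false, prev_bc_b = false;
  for (size_t i = 0; i < num_max; ++i) {
    const int64_t ad = Get(num_a, a, i), bd = Get(num_b, b, i);
    const bool bc_a = (ad == 1), bc_b = (bd == 1);
    if (std::max(ad, bd) == 1) { continue; }
    if (*num_out != 0 && prev_bc_a == bc_a && prev_bc_b == bc_b) {
      a_out[*num_out - 1] *= ad;  // same broadcast pattern: fold into the previous dim
      b_out[*num_out - 1] *= bd;
    } else {
      a_out[*num_out] = ad;
      b_out[*num_out] = bd;
      *num_out += 1;
      prev_bc_a = bc_a;
      prev_bc_b = bc_b;
    }
  }
}

int main() {
  // x: (batch=8, heads=16, seq_q=512, seq_k=512), mask: (8, 1, 1, 512)
  const int64_t x_dims[4] = {8, 16, 512, 512};
  const int64_t mask_dims[4] = {8, 1, 1, 512};
  size_t n = 0;
  int64_t sx[4], sm[4];
  SimplifyBroadcastDimsSketch(4, x_dims, 4, mask_dims, &n, sx, sm);
  // Expected: n == 3, sx == {8, 16 * 512, 512}, sm == {8, 1, 512}; the kernel then
  // takes the num_dims == 3 broadcast path rather than the elementwise one.
  for (size_t i = 0; i < n; ++i) {
    std::printf("%lld vs %lld\n", (long long)sx[i], (long long)sm[i]);
  }
  return 0;
}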
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" - -namespace oneflow { - -namespace { - -template -struct DropoutLoad { - DropoutLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) - : src(src), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) - * static_cast(scale); - } - } - const SRC* src; - const bool* mask; - int64_t row_size; - SRC scale; -}; - -template -struct DropoutStore { - DropoutStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) - : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack softmax_y_pack; - cuda::softmax::Pack dst_pack; - const int64_t offset = (row * row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - softmax_y_pack.elem[i] = static_cast(src[i]); - dst_pack.elem[i] = - static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); - } - *(reinterpret_cast*>(softmax_y) + offset) = - softmax_y_pack.storage; - *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; - } - DST* dst; - DST* softmax_y; - const bool* mask; - int64_t row_size; - DST scale; -}; - -template -void LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, - const MASK* mask, const bool* dropout_mask, - const int64_t elem_cnt, const int64_t rows, const int64_t cols, - const float fill, const float scale, const float dropout_scale, - const int64_t* input_dims, const int64_t* mask_dims) { - DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); - NdIndexOffsetHelper input_index_helper(input_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::BroadcastScaleMaskLoad load( - x, mask, params); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, - const MASK* mask, const bool* dropout_mask, const int64_t rows, - const int64_t cols, const float fill, const float scale, - const float dropout_scale) { - fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); - DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, - 
const MASK* mask, const bool* dropout_mask, - const int64_t elem_cnt, const int64_t rows, const int64_t cols, - const float fill, const float scale, const float dropout_scale, - const int64_t* input_dims, const int64_t* mask_dims) { - DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); - NdIndexOffsetHelper input_index_helper(input_dims, num_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims, num_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); - fused_scale_mask_softmax::BroadcastScaleMaskStore store( - dx, mask, params); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - stream, load_softmax_y, load_dy, store, rows, cols))); -} - -template -void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, - const MASK* mask, const bool* dropout_mask, const int64_t rows, - const int64_t cols, const float fill, const float scale, - const float dropout_scale) { - fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); - DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); - fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - stream, load_softmax_y, load_dy, store, rows, cols))); -} - -constexpr int32_t kMaxNumDims = 5; - -template -class FusedScaleMaskSoftmaxDropoutKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxDropoutKernel() = default; - ~FusedScaleMaskSoftmaxDropoutKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const float mask_fill_value = ctx->Attr("mask_fill_value"); - const float scale_value = ctx->Attr("scale_value"); - const float dropout_scale_value = ctx->Attr("dropout_scale_value"); - user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const ShapeView& x_shape = x->shape_view(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(x_shape.NumAxes(), 2); - const int64_t elem_cnt = x_shape.elem_cnt(); - const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); - const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); - const size_t num_input_dims = x_shape.NumAxes(); - const int64_t* input_dims = x_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseForwardKernel( - ctx->stream()->As()->cuda_stream(), 
x->dptr(), y->mut_dptr(), - softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, - mask_fill_value, scale_value, dropout_scale_value); - } - -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastForwardKernel( \ - ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ - softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, \ - cols, mask_fill_value, scale_value, dropout_scale_value, simplified_input_dims, \ - simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class FusedScaleMaskSoftmaxDropoutGradKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxDropoutGradKernel() = default; - ~FusedScaleMaskSoftmaxDropoutGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float mask_fill_value = static_cast(0.0); - const float scale_value = ctx->Attr("scale_value"); - const float dropout_scale_value = ctx->Attr("dropout_scale_value"); - const ShapeView& dy_shape = dy->shape_view(); - const int64_t elem_cnt = dy_shape.elem_cnt(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(dy_shape.NumAxes(), 2); - const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); - const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); - const int64_t* input_dims = dy_shape.ptr(); - const size_t num_input_dims = dy_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseBackwardKernel( - ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), - dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, - mask_fill_value, scale_value, dropout_scale_value); - } -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastBackwardKernel( \ - ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), \ - dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, cols, \ - static_cast(0.0), ctx->Attr("scale_value"), \ - ctx->Attr("dropout_scale_value"), simplified_input_dims, simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(dtype, mask_dtype) \ - 
REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(dtype, mask_dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" + +namespace oneflow { + +namespace { + +template +struct DropoutLoad { + DropoutLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) + : src(src), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + cuda::softmax::Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) + * static_cast(scale); + } + } + const SRC* src; + const bool* mask; + int64_t row_size; + SRC scale; +}; + +template +struct DropoutStore { + DropoutStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) + : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack softmax_y_pack; + cuda::softmax::Pack dst_pack; + const int64_t offset = (row * row_size + col) / N; + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + softmax_y_pack.elem[i] = static_cast(src[i]); + dst_pack.elem[i] = + static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); + } + *(reinterpret_cast*>(softmax_y) + offset) = + softmax_y_pack.storage; + *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; + } + DST* dst; + DST* softmax_y; + const bool* mask; + int64_t row_size; + DST scale; +}; + +template +void 
LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, + const MASK* mask, const bool* dropout_mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const float dropout_scale, + const int64_t* input_dims, const int64_t* mask_dims) { + DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::BroadcastScaleMaskLoad load( + x, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, + const MASK* mask, const bool* dropout_mask, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const float dropout_scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); + DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, + const MASK* mask, const bool* dropout_mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const float dropout_scale, + const int64_t* input_dims, const int64_t* mask_dims) { + DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); + NdIndexOffsetHelper input_index_helper(input_dims, num_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims, num_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); + fused_scale_mask_softmax::BroadcastScaleMaskStore store( + dx, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + stream, load_softmax_y, load_dy, store, rows, cols))); +} + +template +void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, + const MASK* mask, const bool* dropout_mask, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const float dropout_scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); + DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); + fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + stream, load_softmax_y, load_dy, store, rows, cols))); +} + +constexpr int32_t kMaxNumDims = 5; + +template +class FusedScaleMaskSoftmaxDropoutKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxDropoutKernel() = default; + ~FusedScaleMaskSoftmaxDropoutKernel() 
override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const float mask_fill_value = ctx->Attr("mask_fill_value"); + const float scale_value = ctx->Attr("scale_value"); + const float dropout_scale_value = ctx->Attr("dropout_scale_value"); + user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const ShapeView& x_shape = x->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(x_shape.NumAxes(), 2); + const int64_t elem_cnt = x_shape.elem_cnt(); + const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); + const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const size_t num_input_dims = x_shape.NumAxes(); + const int64_t* input_dims = x_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseForwardKernel( + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), + softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, + mask_fill_value, scale_value, dropout_scale_value); + } + +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastForwardKernel( \ + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ + softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, \ + cols, mask_fill_value, scale_value, dropout_scale_value, simplified_input_dims, \ + simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class FusedScaleMaskSoftmaxDropoutGradKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxDropoutGradKernel() = default; + ~FusedScaleMaskSoftmaxDropoutGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float mask_fill_value = static_cast(0.0); + const float scale_value = ctx->Attr("scale_value"); + const float dropout_scale_value = ctx->Attr("dropout_scale_value"); + const ShapeView& dy_shape = dy->shape_view(); + const int64_t elem_cnt = dy_shape.elem_cnt(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(dy_shape.NumAxes(), 2); + const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); + const int64_t rows = 
dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t* input_dims = dy_shape.ptr(); + const size_t num_input_dims = dy_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseBackwardKernel( + ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), + dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, + mask_fill_value, scale_value, dropout_scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastBackwardKernel( \ + ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), \ + dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, cols, \ + static_cast(0.0), ctx->Attr("scale_value"), \ + ctx->Attr("dropout_scale_value"), simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp b/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp index c72cc2a..458b8b9 100644 --- a/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp @@ -1,293 +1,293 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
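Spelled out on the host for a single row, the fused forward op above computes softmax over (mask ? x * scale : mask_fill_value) and then applies dropout to the softmax output as y = softmax_y * dropout_mask * dropout_scale, with softmax_y also written out for the backward pass. The reference below is a minimal illustrative sketch: it ignores the packed, vectorized loads and the warp/block dispatch of the device implementation, and the example values are made up.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative single-row reference for fused_scale_mask_softmax_dropout (not from the patch).
void FusedScaleMaskSoftmaxDropoutRowRef(const float* x, const bool* mask, const bool* dropout_mask,
                                        int64_t cols, float mask_fill_value, float scale,
                                        float dropout_scale, float* softmax_y, float* y) {
  float max_v = -INFINITY;
  for (int64_t i = 0; i < cols; ++i) {
    const float v = mask[i] ? x[i] * scale : mask_fill_value;  // same rule as the *ScaleMaskLoad functors
    max_v = std::fmax(max_v, v);
  }
  float sum = 0.f;
  for (int64_t i = 0; i < cols; ++i) {
    const float v = mask[i] ? x[i] * scale : mask_fill_value;
    softmax_y[i] = std::exp(v - max_v);
    sum += softmax_y[i];
  }
  for (int64_t i = 0; i < cols; ++i) {
    softmax_y[i] /= sum;
    y[i] = softmax_y[i] * (dropout_mask[i] ? dropout_scale : 0.f);  // same rule as DropoutStore
  }
}

int main() {
  const float x[4] = {1.f, 2.f, 3.f, 4.f};
  const bool mask[4] = {true, true, false, true};
  const bool dropout_mask[4] = {true, false, true, true};
  float softmax_y[4], y[4];
  FusedScaleMaskSoftmaxDropoutRowRef(x, mask, dropout_mask, 4, -1e4f, 0.125f, 1.f / 0.9f,
                                     softmax_y, y);
  for (int i = 0; i < 4; ++i) { std::printf("softmax_y=%f y=%f\n", softmax_y[i], y[i]); }
  return 0;
}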
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/slice_util.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/include/primitive/permute.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -inline hipblasOperation_t GetCublasOp(char op) { - switch (op) { - case 'n': - case 'N': { - return HIPBLAS_OP_N; - } - case 't': - case 'T': { - return HIPBLAS_OP_T; - } - case 'c': - case 'C': { - return HIPBLAS_OP_C; - } - default: { - UNIMPLEMENTED(); - } - } - return HIPBLAS_OP_N; -} - -template -struct CudaDataTypeTrait; - -template<> -struct CudaDataTypeTrait { - const static hipblasDatatype_t value = HIPBLAS_R_32F; -}; - -template<> -struct CudaDataTypeTrait { - const static hipblasDatatype_t value = HIPBLAS_R_16F; -}; - -template -void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, - int64_t k, T alpha, const T* a, int64_t lda, int64_t stridea, const T* b, - int64_t ldb, int64_t strideb, T beta, T* c, int64_t ldc, int64_t stridec, - int64_t batch_size) { - hipblasOperation_t opa = GetCublasOp(transa); - hipblasOperation_t opb = GetCublasOp(transb); - - - hipblasDatatype_t data_type = CudaDataTypeTrait::value; - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - handle, opa, opb, m, n, k, reinterpret_cast(&alpha), - reinterpret_cast(a), data_type, lda, stridea, reinterpret_cast(b), - data_type, ldb, strideb, reinterpret_cast(&beta), reinterpret_cast(c), - data_type, ldc, stridec, batch_size, data_type, HIPBLAS_GEMM_DEFAULT)); - -} - - -template<> -void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, - int64_t k, half alpha, const half* a, int64_t lda, int64_t stridea, - const half* b, int64_t ldb, int64_t strideb, half beta, half* c, - int64_t ldc, int64_t stridec, int64_t batch_size) { - using comp_t = float; - hipblasOperation_t opa = GetCublasOp(transa); - hipblasOperation_t opb = GetCublasOp(transb); - - - float alpha_f = static_cast(alpha); - float beta_f = static_cast(beta); - hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; - hipblasDatatype_t data_type = CudaDataTypeTrait::value; - hipblasDatatype_t comp_type = CudaDataTypeTrait::value; - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - handle, opa, opb, m, n, k, &alpha_f, reinterpret_cast(a), data_type, lda, - stridea, reinterpret_cast(b), data_type, ldb, strideb, &beta_f, - reinterpret_cast(c), data_type, ldc, stridec, batch_size, comp_type, algo)); - -} - -template<> -void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, - int64_t k, float16 alpha, const float16* a, int64_t lda, - int64_t stridea, const float16* b, int64_t ldb, int64_t strideb, - float16 beta, float16* c, int64_t ldc, int64_t stridec, - int64_t batch_size) { - CublasBatchGemm(handle, transa, transb, m, n, k, static_cast(alpha), - reinterpret_cast(a), lda, stridea, - reinterpret_cast(b), ldb, strideb, static_cast(beta), - reinterpret_cast(c), ldc, stridec, batch_size); -} - - -template -void BatchedGemm(ep::Stream* stream, char opa, char opb, int64_t m, int64_t n, 
int64_t k, - float alpha, const T* a, int64_t lda, int64_t stridea, const T* b, int64_t ldb, - int64_t strideb, float beta, T* c, int64_t ldc, int64_t stridec, - int64_t batch_size) { - // swap m and n, a and b to convert from row-major to col-major - CublasBatchGemm(stream->As()->cublas_handle(), opb, opa, n, m, k, - static_cast(alpha), b, ldb, strideb, a, lda, stridea, static_cast(beta), - c, ldc, stridec, batch_size); -} - -SliceParams ConstructSliceParams4Value(int64_t seq_len, int64_t batch_size, int64_t num_heads, - int64_t head_size) { - // slice (s, b, n, 3, h) to (s, b, n, 1, h) - SliceParams params; - params.ndim = 4; - params.dims[0] = seq_len; - params.dims[1] = batch_size; - params.dims[2] = num_heads; - params.dims[3] = 3 * head_size; - params.start[0] = 0; - params.start[1] = 0; - params.start[2] = 0; - params.start[3] = 2 * head_size; - params.step[0] = 1; - params.step[1] = 1; - params.step[2] = 1; - params.step[3] = 1; - params.size[0] = seq_len; - params.size[1] = batch_size; - params.size[2] = num_heads; - params.size[3] = head_size; - return params; -} - -template -void TransposeGpu(ep::Stream* stream, DataType data_type, const ShapeView& in_shape, - const ShapeView& out_shape, const std::vector& perm, const T* in, - T* out) { - CHECK_EQ(in_shape.NumAxes(), out_shape.NumAxes()); - int32_t num_axes = in_shape.NumAxes(); - CHECK_EQ(num_axes, perm.size()); - for (int i = 0; i < perm.size(); ++i) { CHECK_EQ(in_shape.At(perm[i]), out_shape.At(i)); } - auto transpose = ep::primitive::NewPrimitive(stream->device_type(), - in_shape.NumAxes()); - CHECK(transpose); - transpose->Launch(stream, data_type, in_shape.NumAxes(), in_shape.ptr(), in, perm.data(), out); -} - -template -class FusedSelfAttentionQueryMulKeyAndValueGpuKernel final : public user_op::OpKernel { - public: - FusedSelfAttentionQueryMulKeyAndValueGpuKernel() = default; - ~FusedSelfAttentionQueryMulKeyAndValueGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); - int64_t seq_len = h_tensor->shape_view().At(0); - int64_t batch_size = h_tensor->shape_view().At(1); - int64_t hidden_size = h_tensor->shape_view().At(2); - int64_t head_size = ctx->Attr("head_size"); - int64_t num_heads = hidden_size / (3 * head_size); - int64_t ld = batch_size * hidden_size; - int64_t stride = 3 * head_size; - int64_t k_offset = head_size; - - // q * k: (sq, b, n, h) x (sk, b, n, h) => (b, n, sq, h) x (b, n, sk, h) - // => (b, n, sq, h) x (b, n, h, sk) -> (b, n, sq, sk) - float alpha = ctx->Attr("alpha"); - user_op::Tensor* qmk_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key", 0); - const T* q_dptr = h_tensor->dptr(); - const T* k_dptr = h_tensor->dptr() + k_offset; - BatchedGemm(ctx->stream(), 'N', 'T', seq_len, seq_len, head_size, alpha, q_dptr, ld, stride, - k_dptr, ld, stride, 0.0f, qmk_tensor->mut_dptr(), seq_len, seq_len * seq_len, - batch_size * num_heads); - - // slice v - user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - user_op::Tensor* v_tensor = ctx->Tensor4ArgNameAndIndex("value", 0); - SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); - SliceKernelUtil::Forward(ctx->stream(), params, h_tensor->dptr(), - tmp_v_tensor->mut_dptr()); - // v from (s, b, n, h) transpose to (b, n, s, h) - Shape value_shape({seq_len, batch_size, num_heads, head_size}); - 
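// Explanatory note (not from the original sources): tmp_buffer receives V sliced out of the
// packed (s, b, n, 3*h) hidden_states; ConstructSliceParams4Value starts the last axis at
// 2 * head_size, i.e. the third chunk of the fused QKV projection. The permutation {1, 2, 0, 3}
// below then rewrites that slice from (s, b, n, h) to (b, n, s, h), so each (batch, head) pair
// becomes a contiguous (seq_len, head_size) matrix, presumably the layout expected by the
// attention matmuls that consume "value" downstream.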
TransposeGpu(ctx->stream(), h_tensor->data_type(), value_shape, v_tensor->shape_view(), - {1, 2, 0, 3}, tmp_v_tensor->dptr(), v_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel final : public user_op::OpKernel { - public: - FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() = default; - ~FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* v_grad_tensor = ctx->Tensor4ArgNameAndIndex("value_grad", 0); - const user_op::Tensor* qmk_grad_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key_grad", 0); - const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); - user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - user_op::Tensor* h_grad_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states_grad", 0); - - float alpha = ctx->Attr("alpha"); - int64_t seq_len = h_grad_tensor->shape_view().At(0); - int64_t batch_size = h_grad_tensor->shape_view().At(1); - int64_t hidden_size = h_grad_tensor->shape_view().At(2); - int64_t num_heads = v_grad_tensor->shape_view().At(1); - int64_t head_size = v_grad_tensor->shape_view().At(3); - int64_t ld = batch_size * hidden_size; - int64_t stride = 3 * head_size; - CHECK_EQ(hidden_size, num_heads * stride); - - // transpose from (b, n, s, h) to (s, b, n, h) - Shape value_shape({seq_len, batch_size, num_heads, head_size}); - TransposeGpu(ctx->stream(), v_grad_tensor->data_type(), v_grad_tensor->shape_view(), - value_shape, {2, 0, 1, 3}, v_grad_tensor->dptr(), - tmp_v_tensor->mut_dptr()); - // slice v grad - SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); - SliceKernelUtil::Backward(ctx->stream(), params, tmp_v_tensor->dptr(), - h_grad_tensor->mut_dptr()); - - // grad_q = grad_qmk * k - // (b, n, sq, sk) x (b, n, sk, h) -> (b, n, s, h) <= (s, b, n, h) <= (s, b, n, 3, h) - const T* qmk_grad_dptr = qmk_grad_tensor->dptr(); - const T* k_dptr = h_tensor->dptr() + head_size; - T* grad_q_dptr = h_grad_tensor->mut_dptr(); - BatchedGemm(ctx->stream(), 'N', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, - seq_len, seq_len * seq_len, k_dptr, ld, stride, 0.0f, grad_q_dptr, ld, stride, - batch_size * num_heads); - // grad_k = grad_qmk * q - // (b, n, sk, sq) x (b, n, sq, h) -> (b, n, sk, h) <= (s, b, n, h) <= (s, b, n, 3, h) - const T* q_dptr = h_tensor->dptr(); - T* grad_k_dptr = h_grad_tensor->mut_dptr() + head_size; - BatchedGemm(ctx->stream(), 'T', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, - seq_len, seq_len * seq_len, q_dptr, ld, stride, 0.0f, grad_k_dptr, ld, stride, - batch_size * num_heads); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -size_t InferTmpBufferSize(user_op::InferContext* ctx) { - const Shape* value_shape = ctx->OutputShape("value", 0); - DataType value_dtype = *ctx->OutputDType("value", 0); - return value_shape->elem_cnt() * GetSizeOfDataType(value_dtype); -} - -size_t InferGradTmpBufferSize(user_op::InferContext* ctx) { - const Shape& value_shape = ctx->InputShape("value_grad", 0); - const DataType& value_dtype = ctx->InputDType("value_grad", 0); - return value_shape.elem_cnt() * GetSizeOfDataType(value_dtype); -} - -} // namespace - -#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(dtype) \ - 
REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTmpBufferSize); - -#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferGradTmpBufferSize); - -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float) -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float16) -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float) -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float16) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/slice_util.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/include/primitive/permute.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +inline hipblasOperation_t GetCublasOp(char op) { + switch (op) { + case 'n': + case 'N': { + return HIPBLAS_OP_N; + } + case 't': + case 'T': { + return HIPBLAS_OP_T; + } + case 'c': + case 'C': { + return HIPBLAS_OP_C; + } + default: { + UNIMPLEMENTED(); + } + } + return HIPBLAS_OP_N; +} + +template +struct CudaDataTypeTrait; + +template<> +struct CudaDataTypeTrait { + const static hipblasDatatype_t value = HIPBLAS_R_32F; +}; + +template<> +struct CudaDataTypeTrait { + const static hipblasDatatype_t value = HIPBLAS_R_16F; +}; + +template +void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, + int64_t k, T alpha, const T* a, int64_t lda, int64_t stridea, const T* b, + int64_t ldb, int64_t strideb, T beta, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size) { + hipblasOperation_t opa = GetCublasOp(transa); + hipblasOperation_t opb = GetCublasOp(transb); + + + hipblasDatatype_t data_type = CudaDataTypeTrait::value; + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + handle, opa, opb, m, n, k, reinterpret_cast(&alpha), + reinterpret_cast(a), data_type, lda, stridea, reinterpret_cast(b), + data_type, ldb, strideb, reinterpret_cast(&beta), reinterpret_cast(c), + data_type, ldc, stridec, batch_size, data_type, HIPBLAS_GEMM_DEFAULT)); + +} + + +template<> +void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, + int64_t k, half alpha, const half* a, int64_t lda, int64_t stridea, + const half* b, int64_t ldb, int64_t strideb, half beta, half* c, + int64_t ldc, int64_t stridec, int64_t batch_size) { + using comp_t = float; + hipblasOperation_t opa = GetCublasOp(transa); + hipblasOperation_t opb = 
GetCublasOp(transb); + + + float alpha_f = static_cast(alpha); + float beta_f = static_cast(beta); + hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; + hipblasDatatype_t data_type = CudaDataTypeTrait::value; + hipblasDatatype_t comp_type = CudaDataTypeTrait::value; + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + handle, opa, opb, m, n, k, &alpha_f, reinterpret_cast(a), data_type, lda, + stridea, reinterpret_cast(b), data_type, ldb, strideb, &beta_f, + reinterpret_cast(c), data_type, ldc, stridec, batch_size, comp_type, algo)); + +} + +template<> +void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, + int64_t k, float16 alpha, const float16* a, int64_t lda, + int64_t stridea, const float16* b, int64_t ldb, int64_t strideb, + float16 beta, float16* c, int64_t ldc, int64_t stridec, + int64_t batch_size) { + CublasBatchGemm(handle, transa, transb, m, n, k, static_cast(alpha), + reinterpret_cast(a), lda, stridea, + reinterpret_cast(b), ldb, strideb, static_cast(beta), + reinterpret_cast(c), ldc, stridec, batch_size); +} + + +template +void BatchedGemm(ep::Stream* stream, char opa, char opb, int64_t m, int64_t n, int64_t k, + float alpha, const T* a, int64_t lda, int64_t stridea, const T* b, int64_t ldb, + int64_t strideb, float beta, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size) { + // swap m and n, a and b to convert from row-major to col-major + CublasBatchGemm(stream->As()->cublas_handle(), opb, opa, n, m, k, + static_cast(alpha), b, ldb, strideb, a, lda, stridea, static_cast(beta), + c, ldc, stridec, batch_size); +} + +SliceParams ConstructSliceParams4Value(int64_t seq_len, int64_t batch_size, int64_t num_heads, + int64_t head_size) { + // slice (s, b, n, 3, h) to (s, b, n, 1, h) + SliceParams params; + params.ndim = 4; + params.dims[0] = seq_len; + params.dims[1] = batch_size; + params.dims[2] = num_heads; + params.dims[3] = 3 * head_size; + params.start[0] = 0; + params.start[1] = 0; + params.start[2] = 0; + params.start[3] = 2 * head_size; + params.step[0] = 1; + params.step[1] = 1; + params.step[2] = 1; + params.step[3] = 1; + params.size[0] = seq_len; + params.size[1] = batch_size; + params.size[2] = num_heads; + params.size[3] = head_size; + return params; +} + +template +void TransposeGpu(ep::Stream* stream, DataType data_type, const ShapeView& in_shape, + const ShapeView& out_shape, const std::vector& perm, const T* in, + T* out) { + CHECK_EQ(in_shape.NumAxes(), out_shape.NumAxes()); + int32_t num_axes = in_shape.NumAxes(); + CHECK_EQ(num_axes, perm.size()); + for (int i = 0; i < perm.size(); ++i) { CHECK_EQ(in_shape.At(perm[i]), out_shape.At(i)); } + auto transpose = ep::primitive::NewPrimitive(stream->device_type(), + in_shape.NumAxes()); + CHECK(transpose); + transpose->Launch(stream, data_type, in_shape.NumAxes(), in_shape.ptr(), in, perm.data(), out); +} + +template +class FusedSelfAttentionQueryMulKeyAndValueGpuKernel final : public user_op::OpKernel { + public: + FusedSelfAttentionQueryMulKeyAndValueGpuKernel() = default; + ~FusedSelfAttentionQueryMulKeyAndValueGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); + int64_t seq_len = h_tensor->shape_view().At(0); + int64_t batch_size = h_tensor->shape_view().At(1); + int64_t hidden_size = h_tensor->shape_view().At(2); + int64_t head_size = ctx->Attr("head_size"); + int64_t num_heads = hidden_size / 
(3 * head_size); + int64_t ld = batch_size * hidden_size; + int64_t stride = 3 * head_size; + int64_t k_offset = head_size; + + // q * k: (sq, b, n, h) x (sk, b, n, h) => (b, n, sq, h) x (b, n, sk, h) + // => (b, n, sq, h) x (b, n, h, sk) -> (b, n, sq, sk) + float alpha = ctx->Attr("alpha"); + user_op::Tensor* qmk_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key", 0); + const T* q_dptr = h_tensor->dptr(); + const T* k_dptr = h_tensor->dptr() + k_offset; + BatchedGemm(ctx->stream(), 'N', 'T', seq_len, seq_len, head_size, alpha, q_dptr, ld, stride, + k_dptr, ld, stride, 0.0f, qmk_tensor->mut_dptr(), seq_len, seq_len * seq_len, + batch_size * num_heads); + + // slice v + user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + user_op::Tensor* v_tensor = ctx->Tensor4ArgNameAndIndex("value", 0); + SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); + SliceKernelUtil::Forward(ctx->stream(), params, h_tensor->dptr(), + tmp_v_tensor->mut_dptr()); + // v from (s, b, n, h) transpose to (b, n, s, h) + Shape value_shape({seq_len, batch_size, num_heads, head_size}); + TransposeGpu(ctx->stream(), h_tensor->data_type(), value_shape, v_tensor->shape_view(), + {1, 2, 0, 3}, tmp_v_tensor->dptr(), v_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel final : public user_op::OpKernel { + public: + FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() = default; + ~FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* v_grad_tensor = ctx->Tensor4ArgNameAndIndex("value_grad", 0); + const user_op::Tensor* qmk_grad_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key_grad", 0); + const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); + user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + user_op::Tensor* h_grad_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states_grad", 0); + + float alpha = ctx->Attr("alpha"); + int64_t seq_len = h_grad_tensor->shape_view().At(0); + int64_t batch_size = h_grad_tensor->shape_view().At(1); + int64_t hidden_size = h_grad_tensor->shape_view().At(2); + int64_t num_heads = v_grad_tensor->shape_view().At(1); + int64_t head_size = v_grad_tensor->shape_view().At(3); + int64_t ld = batch_size * hidden_size; + int64_t stride = 3 * head_size; + CHECK_EQ(hidden_size, num_heads * stride); + + // transpose from (b, n, s, h) to (s, b, n, h) + Shape value_shape({seq_len, batch_size, num_heads, head_size}); + TransposeGpu(ctx->stream(), v_grad_tensor->data_type(), v_grad_tensor->shape_view(), + value_shape, {2, 0, 1, 3}, v_grad_tensor->dptr(), + tmp_v_tensor->mut_dptr()); + // slice v grad + SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); + SliceKernelUtil::Backward(ctx->stream(), params, tmp_v_tensor->dptr(), + h_grad_tensor->mut_dptr()); + + // grad_q = grad_qmk * k + // (b, n, sq, sk) x (b, n, sk, h) -> (b, n, s, h) <= (s, b, n, h) <= (s, b, n, 3, h) + const T* qmk_grad_dptr = qmk_grad_tensor->dptr(); + const T* k_dptr = h_tensor->dptr() + head_size; + T* grad_q_dptr = h_grad_tensor->mut_dptr(); + BatchedGemm(ctx->stream(), 'N', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, + seq_len, seq_len * seq_len, k_dptr, ld, stride, 0.0f, grad_q_dptr, ld, 
stride, + batch_size * num_heads); + // grad_k = grad_qmk * q + // (b, n, sk, sq) x (b, n, sq, h) -> (b, n, sk, h) <= (s, b, n, h) <= (s, b, n, 3, h) + const T* q_dptr = h_tensor->dptr(); + T* grad_k_dptr = h_grad_tensor->mut_dptr() + head_size; + BatchedGemm(ctx->stream(), 'T', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, + seq_len, seq_len * seq_len, q_dptr, ld, stride, 0.0f, grad_k_dptr, ld, stride, + batch_size * num_heads); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +size_t InferTmpBufferSize(user_op::InferContext* ctx) { + const Shape* value_shape = ctx->OutputShape("value", 0); + DataType value_dtype = *ctx->OutputDType("value", 0); + return value_shape->elem_cnt() * GetSizeOfDataType(value_dtype); +} + +size_t InferGradTmpBufferSize(user_op::InferContext* ctx) { + const Shape& value_shape = ctx->InputShape("value_grad", 0); + const DataType& value_dtype = ctx->InputDType("value_grad", 0); + return value_shape.elem_cnt() * GetSizeOfDataType(value_dtype); +} + +} // namespace + +#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTmpBufferSize); + +#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferGradTmpBufferSize); + +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float) +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float16) +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float) +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float16) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp b/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp index 0f8f340..9585018 100644 --- a/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp @@ -1,229 +1,229 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
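// Illustrative host-side sketch (hypothetical names, not from the OneFlow API): the
// fused_self_attention kernels above read q, k and v straight out of the packed hidden_states
// buffer, whose layout the comments give as (seq_len, batch, num_heads, 3, head_size). The three
// projections therefore sit at fixed offsets inside each head's 3 * head_size slot, and the
// strided BatchedGemm calls only need a leading dimension and a per-head stride:
#include <cstdint>

struct PackedQkvOffsets {
  int64_t q = 0;       // query starts at the beginning of the slot
  int64_t k = 0;       // key offset, matches k_offset = head_size in the forward kernel
  int64_t v = 0;       // value offset, presumably what ConstructSliceParams4Value extracts
  int64_t ld = 0;      // distance between consecutive sequence positions of one head
  int64_t stride = 0;  // distance between the matrices of consecutive (batch, head) pairs
};

inline PackedQkvOffsets ComputePackedQkvOffsets(int64_t batch, int64_t hidden, int64_t head_size) {
  PackedQkvOffsets off;
  off.q = 0;
  off.k = head_size;
  off.v = 2 * head_size;
  off.ld = batch * hidden;     // the `ld` passed to BatchedGemm above
  off.stride = 3 * head_size;  // the `stride` passed to BatchedGemm above
  return off;
}
// With this addressing, q * k^T for every (batch, head) pair becomes one strided batched GEMM over
// batch_size * num_heads problems, which is what the 'N', 'T' BatchedGemm call above expresses.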
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -template -struct TrilScaleLoad { - TrilScaleLoad(const SRC* src, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, SRC fill, - SRC scale) - : src(src), - tril_num_rows(tril_num_rows), - row_size(row_size), - diagonal(diagonal), - fill(fill), - scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - int64_t tril_row = row % tril_num_rows; - int64_t diagonal_col_id = tril_row + diagonal; - bool need_load = (col <= diagonal_col_id); - cuda::softmax::Pack pack; - if (need_load) { - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - } -#pragma unroll - for (int i = 0; i < N; ++i) { - if (col + i > diagonal_col_id) { - dst[i] = static_cast(fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(scale); - } - } - } - const SRC* src; - int64_t tril_num_rows; - int64_t row_size; - int64_t diagonal; - SRC fill; - SRC scale; -}; - -template -struct MaskAndScaleStore { - MaskAndScaleStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) - : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack softmax_y_pack; - cuda::softmax::Pack dst_pack; - const int64_t offset = (row * row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - softmax_y_pack.elem[i] = static_cast(src[i]); - dst_pack.elem[i] = - static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); - } - *(reinterpret_cast*>(softmax_y) + offset) = - softmax_y_pack.storage; - *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; - } - DST* dst; - DST* softmax_y; - const bool* mask; - int64_t row_size; - DST scale; -}; - -template -struct MaskAndScaleLoad { - MaskAndScaleLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) - : src(src), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) - * static_cast(scale); - } - } - const SRC* src; - const bool* mask; - int64_t row_size; - SRC scale; -}; - -template -struct TrilScaleStore { - TrilScaleStore(DST* dst, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, DST fill, - DST scale) - : dst(dst), - tril_num_rows(tril_num_rows), - row_size(row_size), - diagonal(diagonal), - fill(fill), - scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - int64_t tril_row = row % tril_num_rows; -#pragma unroll - for (int i = 0; i < N; ++i) { - if (col + i > tril_row + diagonal) { - pack.elem[i] = fill; - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(scale); - } - } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - int64_t tril_num_rows; - int64_t row_size; - int64_t diagonal; - DST 
fill; - DST scale; -}; - -template -class FusedTrilScaleSoftmaxMaskScaleKernel final : public user_op::OpKernel { - public: - FusedTrilScaleSoftmaxMaskScaleKernel() = default; - ~FusedTrilScaleSoftmaxMaskScaleKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const ShapeView& x_shape = x->shape_view(); - CHECK_GE(x_shape.NumAxes(), 2); - const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); - const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); - const int64_t tril_num_rows = x_shape.At(x_shape.NumAxes() - 2); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - TrilScaleLoad load( - x->dptr(), tril_num_rows, cols, ctx->Attr("diagonal"), - ctx->Attr("tril_fill_value"), ctx->Attr("tril_scale_value")); - MaskAndScaleStore store(y->mut_dptr(), softmax_y->mut_dptr(), - mask->dptr(), cols, - ctx->Attr("mask_scale_value")); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - ctx->stream()->As()->cuda_stream(), load, store, rows, cols))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(half) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(float) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(double) -#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL - -template -class FusedTrilScaleSoftmaxMaskScaleGradKernel final : public user_op::OpKernel { - public: - FusedTrilScaleSoftmaxMaskScaleGradKernel() = default; - ~FusedTrilScaleSoftmaxMaskScaleGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const ShapeView& dy_shape = dy->shape_view(); - CHECK_GE(dy_shape.NumAxes(), 2); - const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); - const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); - const int64_t tril_num_rows = dy_shape.At(dy_shape.NumAxes() - 2); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); - MaskAndScaleLoad load_dy(dy->dptr(), mask->dptr(), cols, - ctx->Attr("mask_scale_value")); - TrilScaleStore store(dx->mut_dptr(), tril_num_rows, cols, - ctx->Attr("diagonal"), static_cast(0.0), - ctx->Attr("tril_scale_value")); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - ctx->stream()->As()->cuda_stream(), load_softmax_y, load_dy, store, rows, - cols))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define 
REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(half) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(float) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(double) -#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +template +struct TrilScaleLoad { + TrilScaleLoad(const SRC* src, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, SRC fill, + SRC scale) + : src(src), + tril_num_rows(tril_num_rows), + row_size(row_size), + diagonal(diagonal), + fill(fill), + scale(scale) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + int64_t tril_row = row % tril_num_rows; + int64_t diagonal_col_id = tril_row + diagonal; + bool need_load = (col <= diagonal_col_id); + cuda::softmax::Pack pack; + if (need_load) { + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); + } +#pragma unroll + for (int i = 0; i < N; ++i) { + if (col + i > diagonal_col_id) { + dst[i] = static_cast(fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(scale); + } + } + } + const SRC* src; + int64_t tril_num_rows; + int64_t row_size; + int64_t diagonal; + SRC fill; + SRC scale; +}; + +template +struct MaskAndScaleStore { + MaskAndScaleStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) + : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack softmax_y_pack; + cuda::softmax::Pack dst_pack; + const int64_t offset = (row * row_size + col) / N; + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + softmax_y_pack.elem[i] = static_cast(src[i]); + dst_pack.elem[i] = + static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); + } + *(reinterpret_cast*>(softmax_y) + offset) = + softmax_y_pack.storage; + *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; + } + DST* dst; + DST* softmax_y; + const bool* mask; + int64_t row_size; + DST scale; +}; + +template +struct MaskAndScaleLoad { + MaskAndScaleLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) + : src(src), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + cuda::softmax::Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = 
*(reinterpret_cast*>(src) + offset); + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) + * static_cast(scale); + } + } + const SRC* src; + const bool* mask; + int64_t row_size; + SRC scale; +}; + +template +struct TrilScaleStore { + TrilScaleStore(DST* dst, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, DST fill, + DST scale) + : dst(dst), + tril_num_rows(tril_num_rows), + row_size(row_size), + diagonal(diagonal), + fill(fill), + scale(scale) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * row_size + col) / N; + int64_t tril_row = row % tril_num_rows; +#pragma unroll + for (int i = 0; i < N; ++i) { + if (col + i > tril_row + diagonal) { + pack.elem[i] = fill; + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(scale); + } + } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + int64_t tril_num_rows; + int64_t row_size; + int64_t diagonal; + DST fill; + DST scale; +}; + +template +class FusedTrilScaleSoftmaxMaskScaleKernel final : public user_op::OpKernel { + public: + FusedTrilScaleSoftmaxMaskScaleKernel() = default; + ~FusedTrilScaleSoftmaxMaskScaleKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const ShapeView& x_shape = x->shape_view(); + CHECK_GE(x_shape.NumAxes(), 2); + const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); + const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const int64_t tril_num_rows = x_shape.At(x_shape.NumAxes() - 2); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + TrilScaleLoad load( + x->dptr(), tril_num_rows, cols, ctx->Attr("diagonal"), + ctx->Attr("tril_fill_value"), ctx->Attr("tril_scale_value")); + MaskAndScaleStore store(y->mut_dptr(), softmax_y->mut_dptr(), + mask->dptr(), cols, + ctx->Attr("mask_scale_value")); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + ctx->stream()->As()->cuda_stream(), load, store, rows, cols))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(half) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(float) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(double) +#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL + +template +class FusedTrilScaleSoftmaxMaskScaleGradKernel final : public user_op::OpKernel { + public: + FusedTrilScaleSoftmaxMaskScaleGradKernel() = default; + ~FusedTrilScaleSoftmaxMaskScaleGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* softmax_y = 
ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const ShapeView& dy_shape = dy->shape_view(); + CHECK_GE(dy_shape.NumAxes(), 2); + const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); + const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t tril_num_rows = dy_shape.At(dy_shape.NumAxes() - 2); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); + MaskAndScaleLoad load_dy(dy->dptr(), mask->dptr(), cols, + ctx->Attr("mask_scale_value")); + TrilScaleStore store(dx->mut_dptr(), tril_num_rows, cols, + ctx->Attr("diagonal"), static_cast(0.0), + ctx->Attr("tril_scale_value")); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + ctx->stream()->As()->cuda_stream(), load_softmax_y, load_dy, store, rows, + cols))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(half) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(float) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(double) +#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/gather_kernel_util.hip.cpp b/oneflow/user/kernels/gather_kernel_util.hip.cpp index 675a617..c783961 100644 --- a/oneflow/user/kernels/gather_kernel_util.hip.cpp +++ b/oneflow/user/kernels/gather_kernel_util.hip.cpp @@ -1,123 +1,123 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
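// Single-row CPU reference of the fused computation above (hypothetical helper, for checking the
// semantics only): TrilScaleLoad applies the triangular mask and scale while loading,
// DispatchSoftmax computes the row softmax, and MaskAndScaleStore applies the boolean mask and a
// second scale while storing. The same math written out plainly:
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<float> TrilScaleSoftmaxMaskScaleRow(const std::vector<float>& x,
                                                const std::vector<bool>& mask, int64_t row,
                                                int64_t diagonal, float fill, float tril_scale,
                                                float mask_scale) {
  const int64_t cols = static_cast<int64_t>(x.size());
  std::vector<float> t(cols);
  // 1) triangular mask + scale, as in TrilScaleLoad: keep col <= row + diagonal, else use fill.
  for (int64_t c = 0; c < cols; ++c) { t[c] = (c <= row + diagonal) ? x[c] * tril_scale : fill; }
  // 2) numerically stable softmax over the row (the part DispatchSoftmax fuses on the device).
  const float max_v = *std::max_element(t.begin(), t.end());
  float sum = 0.f;
  for (float& v : t) {
    v = std::exp(v - max_v);
    sum += v;
  }
  // 3) boolean mask + scale, as in MaskAndScaleStore; softmax_y would be t[c] / sum before masking.
  for (int64_t c = 0; c < cols; ++c) { t[c] = (t[c] / sum) * (mask[c] ? 1.f : 0.f) * mask_scale; }
  return t;
}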
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/gather_kernel_util.h" -#include "oneflow/core/kernel/kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include - -namespace oneflow { - -namespace { - -template -__global__ void GatherForwardGpu(const IDX elem_cnt, NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, const K* indices, - const T* in, const IDX gather_dim_size, T* out, const IDX offset) { - IDX index[3]; - CUDA_1D_KERNEL_LOOP_T(IDX, i, elem_cnt) { - out_helper.OffsetToNdIndex(i, index); - index[1] = indices[index[1]] - offset; - T v{}; - if (index[1] >= 0 && index[1] < gather_dim_size) { v = in[in_helper.NdIndexToOffset(index)]; } - out[i] = v; - } -} - -bool IsSafeUseIndex32(int64_t outer_dim_size, int64_t gather_dim_size, int64_t inner_dim_size, - int64_t num_indices) { - const int64_t in_elem_cnt = outer_dim_size * gather_dim_size * inner_dim_size; - const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; - return std::max(out_elem_cnt, in_elem_cnt) < GetMaxVal() / 2; -} - -template -void DispatchIndexSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const T* in, T* out) { - const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; - if (IsSafeUseIndex32(outer_dim_size, gather_dim_size, inner_dim_size, num_indices)) { - NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); - NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); - GatherForwardGpu<<As()->cuda_stream()>>>( - out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); - } else { - NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); - NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); - GatherForwardGpu<<As()->cuda_stream()>>>( - out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); - } -} - -template -bool TryDispatchMovementType(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const void* in, void* out) { - if (reinterpret_cast(in) % sizeof(T) == 0 - && reinterpret_cast(out) % sizeof(T) == 0 && inner_dim_size % sizeof(T) == 0) { - DispatchIndexSize(stream, outer_dim_size, gather_dim_size, inner_dim_size / sizeof(T), - num_indices, offset, indices, static_cast(in), - static_cast(out)); - return true; - } else { - return false; - } -} - -template -void DispatchMovementSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const void* in, void* out) { - using Func = bool (*)(ep::Stream * stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const void* in, void* out); - Func funcs[] = { - TryDispatchMovementType, // 16B - TryDispatchMovementType, // 8B - TryDispatchMovementType, // 4B - TryDispatchMovementType, // 2B - TryDispatchMovementType, // 1B - }; - for (size_t i = 0; i < sizeof(funcs) / sizeof(funcs[0]); ++i) { - if (funcs[i](stream, outer_dim_size, gather_dim_size, inner_dim_size, num_indices, offset, - indices, in, out)) { - break; - } - } -} - -} // namespace - -template -struct GatherKernelUtilImpl final { - static void 
Forward(ep::Stream* stream, const K* indices, int64_t num_indices, const T* in, - const Shape& flat_in_shape, T* out, const int64_t offset) { - DispatchMovementSize(stream, flat_in_shape.At(0), flat_in_shape.At(1), - flat_in_shape.At(2) * sizeof(T), num_indices, offset, indices, in, out); - } -}; - -#define INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ - template struct GatherKernelUtilImpl; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL, - GATHER_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, GATHER_INDEX_TYPE_SEQ); -#undef INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/gather_kernel_util.h" +#include "oneflow/core/kernel/kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include + +namespace oneflow { + +namespace { + +template +__global__ void GatherForwardGpu(const IDX elem_cnt, NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, const K* indices, + const T* in, const IDX gather_dim_size, T* out, const IDX offset) { + IDX index[3]; + CUDA_1D_KERNEL_LOOP_T(IDX, i, elem_cnt) { + out_helper.OffsetToNdIndex(i, index); + index[1] = indices[index[1]] - offset; + T v{}; + if (index[1] >= 0 && index[1] < gather_dim_size) { v = in[in_helper.NdIndexToOffset(index)]; } + out[i] = v; + } +} + +bool IsSafeUseIndex32(int64_t outer_dim_size, int64_t gather_dim_size, int64_t inner_dim_size, + int64_t num_indices) { + const int64_t in_elem_cnt = outer_dim_size * gather_dim_size * inner_dim_size; + const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; + return std::max(out_elem_cnt, in_elem_cnt) < GetMaxVal() / 2; +} + +template +void DispatchIndexSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const T* in, T* out) { + const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; + if (IsSafeUseIndex32(outer_dim_size, gather_dim_size, inner_dim_size, num_indices)) { + NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); + NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); + GatherForwardGpu<<As()->cuda_stream()>>>( + out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); + } else { + NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); + NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); + GatherForwardGpu<<As()->cuda_stream()>>>( + out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); + } +} + +template +bool TryDispatchMovementType(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const void* in, void* out) { + if (reinterpret_cast(in) % sizeof(T) 
== 0 + && reinterpret_cast(out) % sizeof(T) == 0 && inner_dim_size % sizeof(T) == 0) { + DispatchIndexSize(stream, outer_dim_size, gather_dim_size, inner_dim_size / sizeof(T), + num_indices, offset, indices, static_cast(in), + static_cast(out)); + return true; + } else { + return false; + } +} + +template +void DispatchMovementSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const void* in, void* out) { + using Func = bool (*)(ep::Stream * stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const void* in, void* out); + Func funcs[] = { + TryDispatchMovementType, // 16B + TryDispatchMovementType, // 8B + TryDispatchMovementType, // 4B + TryDispatchMovementType, // 2B + TryDispatchMovementType, // 1B + }; + for (size_t i = 0; i < sizeof(funcs) / sizeof(funcs[0]); ++i) { + if (funcs[i](stream, outer_dim_size, gather_dim_size, inner_dim_size, num_indices, offset, + indices, in, out)) { + break; + } + } +} + +} // namespace + +template +struct GatherKernelUtilImpl final { + static void Forward(ep::Stream* stream, const K* indices, int64_t num_indices, const T* in, + const Shape& flat_in_shape, T* out, const int64_t offset) { + DispatchMovementSize(stream, flat_in_shape.At(0), flat_in_shape.At(1), + flat_in_shape.At(2) * sizeof(T), num_indices, offset, indices, in, out); + } +}; + +#define INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ + template struct GatherKernelUtilImpl; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL, + GATHER_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, GATHER_INDEX_TYPE_SEQ); +#undef INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp index 3b75fe2..2306d8c 100644 --- a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp +++ b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp @@ -1,138 +1,138 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
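// The movement-type dispatch in the gather kernel above can be summarized by a tiny host-side
// helper (hypothetical name): pick the widest of 16/8/4/2/1 bytes such that both pointers and the
// inner dimension's byte count are divisible by it, then copy each inner slice at that width.
#include <cstddef>
#include <cstdint>

inline size_t SelectMovementBytes(const void* in, const void* out, size_t inner_bytes) {
  for (size_t width : {size_t{16}, size_t{8}, size_t{4}, size_t{2}, size_t{1}}) {
    const bool in_aligned = reinterpret_cast<uintptr_t>(in) % width == 0;
    const bool out_aligned = reinterpret_cast<uintptr_t>(out) % width == 0;
    if (in_aligned && out_aligned && inner_bytes % width == 0) { return width; }
  }
  return 1;  // unreachable: a width of 1 byte always matches
}
// For example, float rows of 24 bytes with 8-byte-aligned pointers dispatch at 8 bytes, which is
// why DispatchIndexSize above receives inner_dim_size / sizeof(T) elements of the movement type.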
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/random_generator.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/user/kernels/op_kernel_wrapper.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(const int32_t& batch_size, const int32_t& capacity, void* ptr) - : capacity_{capacity}, - random_value_elem_cnt_{batch_size}, - sorted_value_elem_cnt_{batch_size}, - indices_elem_cnt_{batch_size} { - const int32_t random_value_aligned_bytes = - GetCudaAlignedSize(random_value_elem_cnt_ * sizeof(float)); - const int32_t sorted_value_aligned_bytes = - GetCudaAlignedSize(sorted_value_elem_cnt_ * sizeof(float)); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); - random_value_ptr_ = reinterpret_cast(ptr); - sorted_value_ptr_ = reinterpret_cast(reinterpret_cast(random_value_ptr_) - + random_value_aligned_bytes); - indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_value_ptr_) - + sorted_value_aligned_bytes); - temp_storage_ptr_ = - reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); - temp_storage_bytes_ = - capacity_ - random_value_aligned_bytes - sorted_value_aligned_bytes - indices_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - float* RandomValuePtr() const { return random_value_ptr_; } - float* SortedValuePtr() const { return sorted_value_ptr_; } - int32_t* IndicesPtr() const { return indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int32_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int32_t capacity_; - - float* random_value_ptr_; - float* sorted_value_ptr_; - int32_t* indices_ptr_; - void* temp_storage_ptr_; - - int32_t random_value_elem_cnt_; - int32_t sorted_value_elem_cnt_; - int32_t indices_elem_cnt_; - int32_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i; }; -} - -} // namespace - -class GenerateRandomBatchPermutationIndicesGPUKernel final : public user_op::OpKernel { - public: - GenerateRandomBatchPermutationIndicesGPUKernel() = default; - ~GenerateRandomBatchPermutationIndicesGPUKernel() = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - int64_t seed = ctx->Attr("seed"); - return std::make_shared>>( - seed, ctx->stream()); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* random_generator = - dynamic_cast>*>(state); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t batch_size = y->shape_view().At(0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(batch_size, - static_cast(tmp_buffer->shape_view().elem_cnt()), - tmp_buffer->mut_dptr()); - random_generator->Mutable()->Uniform(batch_size, buf_manager.RandomValuePtr()); - InitializeIndices<<stream()->As()->cuda_stream()>>>( - batch_size, buf_manager.IndicesPtr()); - const int32_t argsort_instance_num = 1; - const int32_t argsort_instance_size = batch_size; - 
SortPairsAscending(buf_manager.RandomValuePtr(), buf_manager.IndicesPtr(), argsort_instance_num, - argsort_instance_size, buf_manager.TempStoragePtr(), - buf_manager.TempStorageBytes(), buf_manager.SortedValuePtr(), - y->mut_dptr(), ctx->stream()->As()->cuda_stream()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("generate_random_batch_permutation_indices") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { - const Shape* y_shape = ctx->OutputShape("y", 0); - const int32_t batch_size = y_shape->At(0); - - const int32_t random_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); - const int32_t sorted_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(int32_t)); - const int32_t argsort_instance_num = 1; - const int32_t argsort_instance_size = batch_size; - const int32_t temp_storage_bytes = InferTempStorageForSortPairsAscending( - argsort_instance_num, argsort_instance_size); - - return random_value_aligned_bytes + sorted_value_aligned_bytes + indices_aligned_bytes - + temp_storage_bytes; - }); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
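// Host-side analogue of the permutation trick above (hypothetical helper): draw one uniform key
// per batch element, fill indices 0..batch_size-1, and sort the (key, index) pairs by key; the
// reordered index column is a uniformly random permutation, which is what SortPairsAscending
// writes into y on the device.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

std::vector<int32_t> RandomPermutationBySortingKeys(int32_t batch_size, uint64_t seed) {
  std::mt19937_64 rng(seed);
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  std::vector<float> keys(batch_size);
  std::vector<int32_t> indices(batch_size);
  for (float& key : keys) { key = uniform(rng); }
  std::iota(indices.begin(), indices.end(), 0);  // mirrors the InitializeIndices kernel
  std::sort(indices.begin(), indices.end(),
            [&](int32_t a, int32_t b) { return keys[a] < keys[b]; });  // mirrors SortPairsAscending
  return indices;  // ties between keys have negligible probability
}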
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/random_generator.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/user/kernels/op_kernel_wrapper.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(const int32_t& batch_size, const int32_t& capacity, void* ptr) + : capacity_{capacity}, + random_value_elem_cnt_{batch_size}, + sorted_value_elem_cnt_{batch_size}, + indices_elem_cnt_{batch_size} { + const int32_t random_value_aligned_bytes = + GetCudaAlignedSize(random_value_elem_cnt_ * sizeof(float)); + const int32_t sorted_value_aligned_bytes = + GetCudaAlignedSize(sorted_value_elem_cnt_ * sizeof(float)); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); + random_value_ptr_ = reinterpret_cast(ptr); + sorted_value_ptr_ = reinterpret_cast(reinterpret_cast(random_value_ptr_) + + random_value_aligned_bytes); + indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_value_ptr_) + + sorted_value_aligned_bytes); + temp_storage_ptr_ = + reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); + temp_storage_bytes_ = + capacity_ - random_value_aligned_bytes - sorted_value_aligned_bytes - indices_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + float* RandomValuePtr() const { return random_value_ptr_; } + float* SortedValuePtr() const { return sorted_value_ptr_; } + int32_t* IndicesPtr() const { return indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int32_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int32_t capacity_; + + float* random_value_ptr_; + float* sorted_value_ptr_; + int32_t* indices_ptr_; + void* temp_storage_ptr_; + + int32_t random_value_elem_cnt_; + int32_t sorted_value_elem_cnt_; + int32_t indices_elem_cnt_; + int32_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i; }; +} + +} // namespace + +class GenerateRandomBatchPermutationIndicesGPUKernel final : public user_op::OpKernel { + public: + GenerateRandomBatchPermutationIndicesGPUKernel() = default; + ~GenerateRandomBatchPermutationIndicesGPUKernel() = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + int64_t seed = ctx->Attr("seed"); + return std::make_shared>>( + seed, ctx->stream()); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* random_generator = + dynamic_cast>*>(state); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t batch_size = y->shape_view().At(0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + TmpBufferManager buf_manager(batch_size, + static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr()); + random_generator->Mutable()->Uniform(batch_size, buf_manager.RandomValuePtr()); + InitializeIndices<<stream()->As()->cuda_stream()>>>( + batch_size, buf_manager.IndicesPtr()); + const int32_t argsort_instance_num = 1; + const int32_t argsort_instance_size = batch_size; + 
SortPairsAscending(buf_manager.RandomValuePtr(), buf_manager.IndicesPtr(), argsort_instance_num, + argsort_instance_size, buf_manager.TempStoragePtr(), + buf_manager.TempStorageBytes(), buf_manager.SortedValuePtr(), + y->mut_dptr(), ctx->stream()->As()->cuda_stream()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("generate_random_batch_permutation_indices") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { + const Shape* y_shape = ctx->OutputShape("y", 0); + const int32_t batch_size = y_shape->At(0); + + const int32_t random_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); + const int32_t sorted_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(int32_t)); + const int32_t argsort_instance_num = 1; + const int32_t argsort_instance_size = batch_size; + const int32_t temp_storage_bytes = InferTempStorageForSortPairsAscending( + argsort_instance_num, argsort_instance_size); + + return random_value_aligned_bytes + sorted_value_aligned_bytes + indices_aligned_bytes + + temp_storage_bytes; + }); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp b/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp index 0b59386..3fbc1aa 100644 --- a/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp +++ b/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp @@ -1,233 +1,233 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -T PowOf2Floor(T val, int64_t max_power) { - CHECK_GT(val, GetZeroVal()); - T max_floor = static_cast(std::pow(2, max_power)); - val = std::min(val, max_floor); - T ret = GetOneVal(); - while (true) { - ret *= 2; - if (ret >= val) { return ret == val ? 
ret : ret / 2; } - } -} - -template -T PowOf2Ceil(T val, int64_t max_power) { - CHECK_GT(val, GetZeroVal()); - T max_ceil = static_cast(std::pow(2, max_power)); - val = std::min(val, max_ceil); - T ret = GetOneVal(); - while (true) { - ret *= 2; - if (ret >= val) { return ret; } - } -} - -template -__device__ void BitonicSwap(T* data, const int64_t i, const int64_t j, const bool dir, - const Compare& comp) { - if (comp(data[i], data[j]) == dir) { - T tmp = data[i]; - data[i] = data[j]; - data[j] = tmp; - } -} - -// https://en.wikipedia.org/wiki/Bitonic_sorter -template -__device__ void BitonicSort(T* data, const int64_t elem_cnt, const Compare& comp) { - // The element count of instance should be pow-of-2 - assert(elem_cnt > 0 && !(elem_cnt & (elem_cnt - 1))); - - // Generate a bitonic sequence from input - for (int64_t size = 2; size <= elem_cnt / 2; size *= 2) { - // Merge 2 bitonic sequences of length 'size' into a bitonic sequence of length '2 * size' - for (int64_t stride = size / 2; stride > 0; stride /= 2) { - for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { - // Change dir at intervals of 'size / 2' swaps - const bool dir = swap_id & (size / 2); - // Locate the pair {pos, pos + stride} which is going te be swaped if needed - const int pos = 2 * swap_id - (swap_id & (stride - 1)); - - BitonicSwap(data, pos, pos + stride, dir, comp); - - __syncthreads(); - } - } - } - - // Sort the bitonic sequence - for (int64_t stride = elem_cnt / 2; stride > 0; stride /= 2) { - for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { - // Locate the pair {pos, pos + stride} which is going te be swaped if needed - const int pos = 2 * swap_id - (swap_id & (stride - 1)); - - BitonicSwap(data, pos, pos + stride, false, comp); - - __syncthreads(); - } - } -} - -template -class Entry final { - public: - __device__ __forceinline__ Entry(int64_t index, T value) : index_(index), value_(value) {} - - __device__ __forceinline__ int64_t GetIndex() const { return index_; } - __device__ __forceinline__ T GetValue() const { return value_; } - __device__ __forceinline__ void SetIndex(int64_t index) { index_ = index; } - __device__ __forceinline__ void SetValue(T value) { value_ = value; } - - __device__ __forceinline__ bool operator<(const Entry& entry) const { - return (value_ < entry.GetValue()) || (value_ == entry.GetValue() && index_ > entry.GetIndex()); - } - __device__ __forceinline__ bool operator>(const Entry& entry) const { - return (value_ > entry.GetValue()) || (value_ == entry.GetValue() && index_ < entry.GetIndex()); - } - - private: - int64_t index_; - T value_; -}; - -template -class MinHeap final { - public: - __device__ __forceinline__ MinHeap(Entry* data, const int64_t heap_size, - const int64_t init_index, const T init_value) - : data_(data), heap_size_(heap_size) { - for (int64_t i = 0; i < heap_size; ++i) { - data_[i].SetIndex(init_index); - data_[i].SetValue(init_value); - } - } - __device__ __forceinline__ Entry& Top() { return data_[0]; } - __device__ __forceinline__ void Swap(const int64_t i, const int64_t j) { - auto tmp = data_[j]; - data_[j] = data_[i]; - data_[i] = tmp; - } - __device__ __forceinline__ void MinHeapify(int64_t index) { - while (true) { - const int64_t left = 2 * index + 1; - const int64_t right = 2 * index + 2; - int64_t min = index; - if (left < heap_size_ && data_[left] < data_[min]) { min = left; } - if (right < heap_size_ && data_[right] < data_[min]) { min = right; } - if (min == index) { return; } - 
Swap(min, index); - index = min; - } - } - - private: - Entry* data_; - int64_t heap_size_; -}; - -template -__global__ void HeapTopKKernel(const T* in_ptr, const int64_t instance_num, - const int64_t instance_size, const int64_t k, - const int64_t heap_size, const int64_t init_index, - const T init_value, int64_t* out_ptr) { - extern __shared__ char smem[]; - auto* shared_entries = reinterpret_cast*>(smem); - - // Divide elements to be sorted into disjoint sets (# of sets == # of heaps). - // Each thread in the thread block manipulates one heap to select top heap_size entries from - // corresponding set - const T* input = in_ptr + blockIdx.x * instance_size; - auto heap = - MinHeap(shared_entries + threadIdx.x * heap_size, heap_size, init_index, init_value); - for (int64_t i = threadIdx.x; i < instance_size; i += blockDim.x) { - auto entry = Entry(i, input[i]); - if (entry > heap.Top()) { - heap.Top() = entry; - heap.MinHeapify(0); - } - } - - __syncthreads(); - - // Merge all heaps into a unified, sorted array - BitonicSort(shared_entries, blockDim.x * heap_size, - [](const Entry& x, const Entry& y) { return x > y; }); - - // Write top_k elements in sorted array to output - for (int64_t i = threadIdx.x; i < k; i += blockDim.x) { - (out_ptr + blockIdx.x * k)[i] = shared_entries[i].GetIndex(); - } -} - -} // namespace - -template -class GpuHeapSelectionTopKKernel final : public user_op::OpKernel { - public: - GpuHeapSelectionTopKKernel() = default; - ~GpuHeapSelectionTopKKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape_view().elem_cnt() == 0) { return; } - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int64_t instance_num = in->shape_view().elem_cnt() / instance_size; - const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); - - // Use as many heaps as possible (# of heaps == # of threads used in thread block). - // Limitation 1: size of shared memory - // We also need heap_size * num_heap to be pow-of-2 which is necessary for bitonic sort - const int64_t heap_size = PowOf2Ceil(k, 16); - int32_t num_heap = - PowOf2Floor(kCudaMaxSharedMemoryByteSize / (heap_size * sizeof(Entry)), 16); - // Limitation 2: # of threads in thread block - num_heap = std::min(num_heap, kCudaThreadsNumPerBlock); - - HeapTopKKernel<<), - ctx->stream()->As()->cuda_stream()>>>( - in->dptr(), instance_num, instance_size, k, heap_size, GetMaxVal(), - GetMinVal(), out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(dtype) \ - REGISTER_USER_KERNEL("top_k").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) && (user_op::HobAttr("k") <= 128) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)); - -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(float) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(double) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(uint8_t) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int8_t) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int32_t) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +T PowOf2Floor(T val, int64_t max_power) { + CHECK_GT(val, GetZeroVal()); + T max_floor = static_cast(std::pow(2, max_power)); + val = std::min(val, max_floor); + T ret = GetOneVal(); + while (true) { + ret *= 2; + if (ret >= val) { return ret == val ? ret : ret / 2; } + } +} + +template +T PowOf2Ceil(T val, int64_t max_power) { + CHECK_GT(val, GetZeroVal()); + T max_ceil = static_cast(std::pow(2, max_power)); + val = std::min(val, max_ceil); + T ret = GetOneVal(); + while (true) { + ret *= 2; + if (ret >= val) { return ret; } + } +} + +template +__device__ void BitonicSwap(T* data, const int64_t i, const int64_t j, const bool dir, + const Compare& comp) { + if (comp(data[i], data[j]) == dir) { + T tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } +} + +// https://en.wikipedia.org/wiki/Bitonic_sorter +template +__device__ void BitonicSort(T* data, const int64_t elem_cnt, const Compare& comp) { + // The element count of instance should be pow-of-2 + assert(elem_cnt > 0 && !(elem_cnt & (elem_cnt - 1))); + + // Generate a bitonic sequence from input + for (int64_t size = 2; size <= elem_cnt / 2; size *= 2) { + // Merge 2 bitonic sequences of length 'size' into a bitonic sequence of length '2 * size' + for (int64_t stride = size / 2; stride > 0; stride /= 2) { + for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { + // Change dir at intervals of 'size / 2' swaps + const bool dir = swap_id & (size / 2); + // Locate the pair {pos, pos + stride} which is going te be swaped if needed + const int pos = 2 * swap_id - (swap_id & (stride - 1)); + + BitonicSwap(data, pos, pos + stride, dir, comp); + + __syncthreads(); + } + } + } + + // Sort the bitonic sequence + for (int64_t stride = elem_cnt / 2; stride > 0; stride /= 2) { + for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { + // Locate the pair {pos, pos + stride} which is going te be swaped if needed + const int pos = 2 * swap_id - (swap_id & (stride - 1)); + + BitonicSwap(data, pos, pos + stride, false, comp); + + __syncthreads(); + } + } +} + +template +class Entry final { + public: + __device__ __forceinline__ Entry(int64_t index, T value) : index_(index), value_(value) {} + + __device__ __forceinline__ int64_t GetIndex() const { return index_; } + __device__ __forceinline__ T GetValue() const { return value_; } + __device__ __forceinline__ void SetIndex(int64_t index) { index_ = index; } + __device__ __forceinline__ void SetValue(T value) { value_ = value; } + + __device__ __forceinline__ bool operator<(const Entry& entry) const { + return (value_ < entry.GetValue()) || (value_ == entry.GetValue() && index_ > entry.GetIndex()); + } + __device__ __forceinline__ bool operator>(const Entry& entry) const { + return (value_ > entry.GetValue()) || (value_ == entry.GetValue() && 
index_ < entry.GetIndex()); + } + + private: + int64_t index_; + T value_; +}; + +template +class MinHeap final { + public: + __device__ __forceinline__ MinHeap(Entry* data, const int64_t heap_size, + const int64_t init_index, const T init_value) + : data_(data), heap_size_(heap_size) { + for (int64_t i = 0; i < heap_size; ++i) { + data_[i].SetIndex(init_index); + data_[i].SetValue(init_value); + } + } + __device__ __forceinline__ Entry& Top() { return data_[0]; } + __device__ __forceinline__ void Swap(const int64_t i, const int64_t j) { + auto tmp = data_[j]; + data_[j] = data_[i]; + data_[i] = tmp; + } + __device__ __forceinline__ void MinHeapify(int64_t index) { + while (true) { + const int64_t left = 2 * index + 1; + const int64_t right = 2 * index + 2; + int64_t min = index; + if (left < heap_size_ && data_[left] < data_[min]) { min = left; } + if (right < heap_size_ && data_[right] < data_[min]) { min = right; } + if (min == index) { return; } + Swap(min, index); + index = min; + } + } + + private: + Entry* data_; + int64_t heap_size_; +}; + +template +__global__ void HeapTopKKernel(const T* in_ptr, const int64_t instance_num, + const int64_t instance_size, const int64_t k, + const int64_t heap_size, const int64_t init_index, + const T init_value, int64_t* out_ptr) { + extern __shared__ char smem[]; + auto* shared_entries = reinterpret_cast*>(smem); + + // Divide elements to be sorted into disjoint sets (# of sets == # of heaps). + // Each thread in the thread block manipulates one heap to select top heap_size entries from + // corresponding set + const T* input = in_ptr + blockIdx.x * instance_size; + auto heap = + MinHeap(shared_entries + threadIdx.x * heap_size, heap_size, init_index, init_value); + for (int64_t i = threadIdx.x; i < instance_size; i += blockDim.x) { + auto entry = Entry(i, input[i]); + if (entry > heap.Top()) { + heap.Top() = entry; + heap.MinHeapify(0); + } + } + + __syncthreads(); + + // Merge all heaps into a unified, sorted array + BitonicSort(shared_entries, blockDim.x * heap_size, + [](const Entry& x, const Entry& y) { return x > y; }); + + // Write top_k elements in sorted array to output + for (int64_t i = threadIdx.x; i < k; i += blockDim.x) { + (out_ptr + blockIdx.x * k)[i] = shared_entries[i].GetIndex(); + } +} + +} // namespace + +template +class GpuHeapSelectionTopKKernel final : public user_op::OpKernel { + public: + GpuHeapSelectionTopKKernel() = default; + ~GpuHeapSelectionTopKKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + if (in->shape_view().elem_cnt() == 0) { return; } + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = in->shape_view().elem_cnt() / instance_size; + const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); + + // Use as many heaps as possible (# of heaps == # of threads used in thread block). 
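  // For a concrete sense of scale (assuming Entry<float> occupies 16 bytes, an 8-byte index plus a
  // value padded to 8, and, say, 48 KB of shared memory per block): with k = 32 the heap size is
  // PowOf2Ceil(32, 16) = 32, so 48 * 1024 / (32 * 16) = 96 heaps fit, and PowOf2Floor(96, 16)
  // rounds that down to 64 heaps; the block-size cap below may shrink it further.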
+ // Limitation 1: size of shared memory + // We also need heap_size * num_heap to be pow-of-2 which is necessary for bitonic sort + const int64_t heap_size = PowOf2Ceil(k, 16); + int32_t num_heap = + PowOf2Floor(kCudaMaxSharedMemoryByteSize / (heap_size * sizeof(Entry)), 16); + // Limitation 2: # of threads in thread block + num_heap = std::min(num_heap, kCudaThreadsNumPerBlock); + + HeapTopKKernel<<), + ctx->stream()->As()->cuda_stream()>>>( + in->dptr(), instance_num, instance_size, k, heap_size, GetMaxVal(), + GetMinVal(), out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(dtype) \ + REGISTER_USER_KERNEL("top_k").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) && (user_op::HobAttr("k") <= 128) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)); + +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(float) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(double) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(uint8_t) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int8_t) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int32_t) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/image_preprocess_kernels.hip.cpp b/oneflow/user/kernels/image_preprocess_kernels.hip.cpp index 26a961e..49d0bef 100644 --- a/oneflow/user/kernels/image_preprocess_kernels.hip.cpp +++ b/oneflow/user/kernels/image_preprocess_kernels.hip.cpp @@ -1,216 +1,216 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
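// CPU sketch of the per-thread selection step in the top_k kernel above (hypothetical helper):
// keep a fixed-size min-heap whose root is the smallest retained entry and replace the root
// whenever a larger value arrives; the GPU kernel additionally merges all per-thread heaps with a
// bitonic sort in shared memory, a step this host version does not need.
#include <cstdint>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

std::vector<int64_t> TopKIndicesByMinHeap(const std::vector<float>& data, int64_t k) {
  using HeapEntry = std::pair<float, int64_t>;  // (value, original index)
  // std::greater turns priority_queue into a min-heap, so top() plays the role of MinHeap::Top().
  std::priority_queue<HeapEntry, std::vector<HeapEntry>, std::greater<HeapEntry>> heap;
  for (int64_t i = 0; i < static_cast<int64_t>(data.size()); ++i) {
    if (static_cast<int64_t>(heap.size()) < k) {
      heap.emplace(data[i], i);
    } else if (data[i] > heap.top().first) {
      heap.pop();
      heap.emplace(data[i], i);
    }
  }
  std::vector<int64_t> indices;
  while (!heap.empty()) {
    indices.push_back(heap.top().second);
    heap.pop();
  }
  return {indices.rbegin(), indices.rend()};  // largest value first, as top_k returns it
}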
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/small_vector.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -struct NormalizeVal { - float val[3]; -}; - -enum TensorLayout { - kNCHW = 0, - kNHWC = 1, -}; - -class NormalizeAttr final : public user_op::OpKernelState { - public: - NormalizeAttr(user_op::KernelInitContext* ctx) { - const std::vector& mean_vec = ctx->Attr>("mean"); - if (mean_vec.size() == 1) { - for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(0); } - } else if (mean_vec.size() == 3) { - for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(i); } - } else { - UNIMPLEMENTED(); - } - - const std::vector& std_vec = ctx->Attr>("std"); - if (std_vec.size() == 1) { - for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(0); } - } else if (std_vec.size() == 3) { - for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(i); } - } else { - UNIMPLEMENTED(); - } - } - ~NormalizeAttr() = default; - - const NormalizeVal& mean() const { return mean_; } - const NormalizeVal& inv_std() const { return inv_std_; } - - private: - NormalizeVal mean_; - NormalizeVal inv_std_; -}; - -template -__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, - const int8_t* mirror_dptr, int32_t out_W, - int32_t H_offset, int32_t W_offset); -template<> -__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, - const int8_t* mirror_dptr, - int32_t out_W, int32_t H_offset, - int32_t W_offset) { - if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[3] = out_W - 1 - out_idx[3]; } - in_idx[0] = out_idx[0]; // N - in_idx[1] = out_idx[2] + H_offset; // H - in_idx[2] = out_idx[3] + W_offset; // W - in_idx[3] = out_idx[1]; // C -} - -template<> -__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, - const int8_t* mirror_dptr, - int32_t out_W, int32_t H_offset, - int32_t W_offset) { - if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[2] = out_W - 1 - out_idx[2]; } - in_idx[0] = out_idx[0]; // N - in_idx[1] = out_idx[1] + H_offset; // H - in_idx[2] = out_idx[2] + W_offset; // W - in_idx[3] = out_idx[3]; // C -} - -template -__global__ void CropMirrorNormalizeGpuImpl(int32_t elem_cnt, const uint8_t* in_dptr, - float* out_dptr, const int8_t* mirror_dptr, - int32_t out_W, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, - int32_t H_offset, int32_t W_offset, - const NormalizeVal mean, const NormalizeVal inv_std) { - CUDA_1D_KERNEL_LOOP(out_offset, elem_cnt) { - int32_t in_idx[4]; - int32_t out_idx[4]; - out_helper.OffsetToNdIndex(out_offset, out_idx); - OutIdx2InIdx(out_idx, in_idx, mirror_dptr, out_W, H_offset, W_offset); - float mean_val; - float inv_std_val; - const int32_t c = in_idx[3]; - // When the compiler can't resolve array indices to constants it will put private arrays into - // GPU local memory. Using local memory is slower than keeping array elements directly in - // registers. 
- if (c == 0) { - mean_val = mean.val[0]; - inv_std_val = inv_std.val[0]; - } else if (c == 1) { - mean_val = mean.val[1]; - inv_std_val = inv_std.val[1]; - } else if (c == 2) { - mean_val = mean.val[2]; - inv_std_val = inv_std.val[2]; - } else { - // undefined behavior - assert(false); - } - int32_t in_offset = in_helper.NdIndexToOffset(in_idx); - out_dptr[out_offset] = (static_cast(in_dptr[in_offset]) - mean_val) * inv_std_val; - } -} - -} // namespace - -class CropMirrorNormalizeGpuKernel final : public user_op::OpKernel { - public: - CropMirrorNormalizeGpuKernel() = default; - ~CropMirrorNormalizeGpuKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* normalize_attr = dynamic_cast(state); - const NormalizeVal& mean = normalize_attr->mean(); - const NormalizeVal& inv_std = normalize_attr->inv_std(); - user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - const std::string& output_layout = ctx->Attr("output_layout"); - float* out_dptr = out_blob->mut_dptr(); - const uint8_t* in_dptr = in_blob->dptr(); - const ShapeView& in_shape = in_blob->shape_view(); - const ShapeView& out_shape = out_blob->shape_view(); - CHECK_EQ(in_shape.NumAxes(), 4); - CHECK_EQ(out_shape.NumAxes(), 4); - int32_t elem_cnt = out_shape.elem_cnt(); - CHECK_LE(elem_cnt, GetMaxVal()); - float crop_pos_y = ctx->Attr("crop_pos_y"); - float crop_pos_x = ctx->Attr("crop_pos_x"); - - int32_t N = in_shape.At(0); - int32_t in_H = in_shape.At(1); - int32_t in_W = in_shape.At(2); - int32_t C = in_shape.At(3); - const NdIndexOffsetHelper in_helper(N, in_H, in_W, C); - const int8_t* mirror_dptr = nullptr; - user_op::Tensor* mirror_blob = ctx->Tensor4ArgNameAndIndex("mirror", 0); - if (mirror_blob) { mirror_dptr = mirror_blob->dptr(); } - - if (output_layout == "NCHW") { - CHECK_EQ(N, out_shape.At(0)); - CHECK_EQ(C, out_shape.At(1)); - int32_t out_H = out_shape.At(2); - int32_t out_W = out_shape.At(3); - CHECK_LE(out_H, in_H); - CHECK_LE(out_W, in_W); - int32_t H_offset = (in_H - out_H) * crop_pos_y; - int32_t W_offset = (in_W - out_W) * crop_pos_x; - const NdIndexOffsetHelper out_helper(N, C, out_H, out_W); - CropMirrorNormalizeGpuImpl - <<stream()->As()->cuda_stream()>>>( - elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, - W_offset, mean, inv_std); - } else if (output_layout == "NHWC") { - CHECK_EQ(N, out_shape.At(0)); - int32_t out_H = out_shape.At(1); - int32_t out_W = out_shape.At(2); - CHECK_EQ(C, out_shape.At(3)); - CHECK_LE(out_H, in_H); - CHECK_LE(out_W, in_W); - int32_t H_offset = (in_H - out_H) * crop_pos_y; - int32_t W_offset = (in_W - out_W) * crop_pos_x; - const NdIndexOffsetHelper out_helper(N, out_H, out_W, C); - CropMirrorNormalizeGpuImpl - <<stream()->As()->cuda_stream()>>>( - elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, - W_offset, mean, inv_std); - } else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("crop_mirror_normalize_from_uint8") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("in", 0) == DataType::kUInt8) - && 
(user_op::HobDataType("out", 0) == DataType::kFloat)); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/small_vector.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +struct NormalizeVal { + float val[3]; +}; + +enum TensorLayout { + kNCHW = 0, + kNHWC = 1, +}; + +class NormalizeAttr final : public user_op::OpKernelState { + public: + NormalizeAttr(user_op::KernelInitContext* ctx) { + const std::vector& mean_vec = ctx->Attr>("mean"); + if (mean_vec.size() == 1) { + for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(0); } + } else if (mean_vec.size() == 3) { + for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(i); } + } else { + UNIMPLEMENTED(); + } + + const std::vector& std_vec = ctx->Attr>("std"); + if (std_vec.size() == 1) { + for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(0); } + } else if (std_vec.size() == 3) { + for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(i); } + } else { + UNIMPLEMENTED(); + } + } + ~NormalizeAttr() = default; + + const NormalizeVal& mean() const { return mean_; } + const NormalizeVal& inv_std() const { return inv_std_; } + + private: + NormalizeVal mean_; + NormalizeVal inv_std_; +}; + +template +__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, + const int8_t* mirror_dptr, int32_t out_W, + int32_t H_offset, int32_t W_offset); +template<> +__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, + const int8_t* mirror_dptr, + int32_t out_W, int32_t H_offset, + int32_t W_offset) { + if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[3] = out_W - 1 - out_idx[3]; } + in_idx[0] = out_idx[0]; // N + in_idx[1] = out_idx[2] + H_offset; // H + in_idx[2] = out_idx[3] + W_offset; // W + in_idx[3] = out_idx[1]; // C +} + +template<> +__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, + const int8_t* mirror_dptr, + int32_t out_W, int32_t H_offset, + int32_t W_offset) { + if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[2] = out_W - 1 - out_idx[2]; } + in_idx[0] = out_idx[0]; // N + in_idx[1] = out_idx[1] + H_offset; // H + in_idx[2] = out_idx[2] + W_offset; // W + in_idx[3] = out_idx[3]; // C +} + +template +__global__ void CropMirrorNormalizeGpuImpl(int32_t elem_cnt, const uint8_t* in_dptr, + float* out_dptr, const int8_t* mirror_dptr, + int32_t out_W, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, + int32_t H_offset, int32_t W_offset, + const NormalizeVal mean, const NormalizeVal inv_std) { + CUDA_1D_KERNEL_LOOP(out_offset, elem_cnt) { + int32_t in_idx[4]; + int32_t out_idx[4]; + out_helper.OffsetToNdIndex(out_offset, out_idx); + OutIdx2InIdx(out_idx, in_idx, mirror_dptr, out_W, H_offset, W_offset); + float mean_val; + float inv_std_val; + const int32_t c = 
in_idx[3]; + // When the compiler can't resolve array indices to constants it will put private arrays into + // GPU local memory. Using local memory is slower than keeping array elements directly in + // registers. + if (c == 0) { + mean_val = mean.val[0]; + inv_std_val = inv_std.val[0]; + } else if (c == 1) { + mean_val = mean.val[1]; + inv_std_val = inv_std.val[1]; + } else if (c == 2) { + mean_val = mean.val[2]; + inv_std_val = inv_std.val[2]; + } else { + // undefined behavior + assert(false); + } + int32_t in_offset = in_helper.NdIndexToOffset(in_idx); + out_dptr[out_offset] = (static_cast(in_dptr[in_offset]) - mean_val) * inv_std_val; + } +} + +} // namespace + +class CropMirrorNormalizeGpuKernel final : public user_op::OpKernel { + public: + CropMirrorNormalizeGpuKernel() = default; + ~CropMirrorNormalizeGpuKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* normalize_attr = dynamic_cast(state); + const NormalizeVal& mean = normalize_attr->mean(); + const NormalizeVal& inv_std = normalize_attr->inv_std(); + user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::string& output_layout = ctx->Attr("output_layout"); + float* out_dptr = out_blob->mut_dptr(); + const uint8_t* in_dptr = in_blob->dptr(); + const ShapeView& in_shape = in_blob->shape_view(); + const ShapeView& out_shape = out_blob->shape_view(); + CHECK_EQ(in_shape.NumAxes(), 4); + CHECK_EQ(out_shape.NumAxes(), 4); + int32_t elem_cnt = out_shape.elem_cnt(); + CHECK_LE(elem_cnt, GetMaxVal()); + float crop_pos_y = ctx->Attr("crop_pos_y"); + float crop_pos_x = ctx->Attr("crop_pos_x"); + + int32_t N = in_shape.At(0); + int32_t in_H = in_shape.At(1); + int32_t in_W = in_shape.At(2); + int32_t C = in_shape.At(3); + const NdIndexOffsetHelper in_helper(N, in_H, in_W, C); + const int8_t* mirror_dptr = nullptr; + user_op::Tensor* mirror_blob = ctx->Tensor4ArgNameAndIndex("mirror", 0); + if (mirror_blob) { mirror_dptr = mirror_blob->dptr(); } + + if (output_layout == "NCHW") { + CHECK_EQ(N, out_shape.At(0)); + CHECK_EQ(C, out_shape.At(1)); + int32_t out_H = out_shape.At(2); + int32_t out_W = out_shape.At(3); + CHECK_LE(out_H, in_H); + CHECK_LE(out_W, in_W); + int32_t H_offset = (in_H - out_H) * crop_pos_y; + int32_t W_offset = (in_W - out_W) * crop_pos_x; + const NdIndexOffsetHelper out_helper(N, C, out_H, out_W); + CropMirrorNormalizeGpuImpl + <<stream()->As()->cuda_stream()>>>( + elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, + W_offset, mean, inv_std); + } else if (output_layout == "NHWC") { + CHECK_EQ(N, out_shape.At(0)); + int32_t out_H = out_shape.At(1); + int32_t out_W = out_shape.At(2); + CHECK_EQ(C, out_shape.At(3)); + CHECK_LE(out_H, in_H); + CHECK_LE(out_W, in_W); + int32_t H_offset = (in_H - out_H) * crop_pos_y; + int32_t W_offset = (in_W - out_W) * crop_pos_x; + const NdIndexOffsetHelper out_helper(N, out_H, out_W, C); + CropMirrorNormalizeGpuImpl + <<stream()->As()->cuda_stream()>>>( + elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, + W_offset, mean, inv_std); + } else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; 
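Per output element, the kernel above only does index arithmetic plus one fused multiply: flip the output column if the sample is mirrored, shift by the crop offsets, map into the NHWC input layout, then apply (x - mean) * inv_std. A standalone C++ sketch of that mapping for the NCHW-output branch (names are illustrative, not OneFlow API):

    #include <cstdint>

    // One output element of crop_mirror_normalize: NHWC uint8 input, NCHW float output.
    // H_offset/W_offset are (in_H - out_H) * crop_pos_y and (in_W - out_W) * crop_pos_x.
    float CropMirrorNormalizeOne(const uint8_t* in, int32_t in_H, int32_t in_W, int32_t C,
                                 int32_t n, int32_t c, int32_t h, int32_t w, int32_t out_W,
                                 int32_t H_offset, int32_t W_offset, bool mirror,
                                 float mean, float inv_std) {
      if (mirror) { w = out_W - 1 - w; }   // horizontal flip within the output row
      const int32_t in_h = h + H_offset;   // top edge of the crop window
      const int32_t in_w = w + W_offset;   // left edge of the crop window
      const int64_t in_offset =
          ((static_cast<int64_t>(n) * in_H + in_h) * in_W + in_w) * C + c;  // NHWC layout
      return (static_cast<float>(in[in_offset]) - mean) * inv_std;
    }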
+ +REGISTER_USER_KERNEL("crop_mirror_normalize_from_uint8") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("in", 0) == DataType::kUInt8) + && (user_op::HobDataType("out", 0) == DataType::kFloat)); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp b/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp index b971dc1..e6be6f7 100644 --- a/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp +++ b/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp @@ -1,68 +1,68 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/in_top_k_kernel_util.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void InTopkGpu(const int instance_num, const int classes_num, const T* targets, - const float* predictions, const int k, bool* out) { - CUDA_1D_KERNEL_LOOP(idx, instance_num) { - T target = targets[idx]; - bool cannot_say = (target >= classes_num) || !isfinite(predictions[idx * classes_num + target]); - - int32_t more_probable_classes = 0; - if (!cannot_say) { - const float target_prediction = predictions[idx * classes_num + target]; - FOR_RANGE(int32_t, class_idx, 0, classes_num) { - float pred = predictions[idx * classes_num + class_idx]; - - if (!isfinite(pred)) { - cannot_say = true; - break; - } else if (pred > target_prediction) { - ++more_probable_classes; - if (more_probable_classes > k) break; - } - } - } - out[idx] = cannot_say ? false : (more_probable_classes < k); - } -} - -} // namespace - -template -struct InTopkKernelUtil { - static void InTopk(ep::Stream* stream, const int instance_num, const int classes_num, - const T* targets, const float* predictions, const int k, bool* out) { - RUN_CUDA_KERNEL((InTopkGpu), stream, instance_num, instance_num, classes_num, targets, - predictions, k, out); - } -}; - -#define INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA(cpp_data_type, data_type) \ - template struct InTopkKernelUtil; - -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ) - -#undef INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/in_top_k_kernel_util.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void InTopkGpu(const int instance_num, const int classes_num, const T* targets, + const float* predictions, const int k, bool* out) { + CUDA_1D_KERNEL_LOOP(idx, instance_num) { + T target = targets[idx]; + bool cannot_say = (target >= classes_num) || !isfinite(predictions[idx * classes_num + target]); + + int32_t more_probable_classes = 0; + if (!cannot_say) { + const float target_prediction = predictions[idx * classes_num + target]; + FOR_RANGE(int32_t, class_idx, 0, classes_num) { + float pred = predictions[idx * classes_num + class_idx]; + + if (!isfinite(pred)) { + cannot_say = true; + break; + } else if (pred > target_prediction) { + ++more_probable_classes; + if (more_probable_classes > k) break; + } + } + } + out[idx] = cannot_say ? false : (more_probable_classes < k); + } +} + +} // namespace + +template +struct InTopkKernelUtil { + static void InTopk(ep::Stream* stream, const int instance_num, const int classes_num, + const T* targets, const float* predictions, const int k, bool* out) { + RUN_CUDA_KERNEL((InTopkGpu), stream, instance_num, instance_num, classes_num, targets, + predictions, k, out); + } +}; + +#define INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA(cpp_data_type, data_type) \ + template struct InTopkKernelUtil; + +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ) + +#undef INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/kl_div_kernel.hip.cpp b/oneflow/user/kernels/kl_div_kernel.hip.cpp index cc78fa5..eddebd8 100644 --- a/oneflow/user/kernels/kl_div_kernel.hip.cpp +++ b/oneflow/user/kernels/kl_div_kernel.hip.cpp @@ -1,121 +1,121 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -using namespace loss; - -template -struct KLDivFunctor { - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - if (LOG_TARGET) { - return exp(target_val) * (target_val - input_val); - } else { - const T zero_val = static_cast(0); - const T out_val = target_val * (SafeLog(target_val) - input_val); - return target_val > zero_val ? 
out_val : zero_val; - } - } -}; - -template -struct KLDivFunctor { - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - if (LOG_TARGET) { - return hexp(target_val) * (target_val - input_val); - } else { - const half zero_val = __float2half(0.f); - const half out_val = target_val * (SafeLog(target_val) - input_val); - return target_val > zero_val ? out_val : zero_val; - } - } -}; - -template -struct KLDivGradFunctor { - __device__ __forceinline__ T operator()(T target_val, T dy_val) const { - if (LOG_TARGET) { - return -exp(target_val) * dy_val; - } else { - const T zero_val = static_cast(0); - return target_val > zero_val ? -target_val * dy_val : zero_val; - } - } -}; - -template -struct KLDivGradFunctor { - __device__ __forceinline__ half operator()(half target_val, half dy_val) const { - if (LOG_TARGET) { - return __hneg(hexp(target_val) * dy_val); - } else { - const half zero_val = __float2half(0.f); - return target_val > zero_val ? __hneg(target_val * dy_val) : zero_val; - } - } -}; - -template -class KLDivKernel : public SimpleLossKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, T* out) const { - const bool log_target = ctx->Attr("log_target"); - if (log_target) { - OF_CUDA_CHECK( - (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK( - (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } - } -}; - -template -class KLDivGradKernel : public SimpleLossGradKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, const T* dy, T* dx) const { - const bool log_target = ctx->Attr("log_target"); - if (log_target) { - OF_CUDA_CHECK((cuda::elementwise::Binary( - KLDivGradFunctor(), elem_cnt, dx, target, dy, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Binary( - KLDivGradFunctor(), elem_cnt, dx, target, dy, - ctx->stream()->As()->cuda_stream()))); - } - } -}; - -} // namespace - -REGISTER_SIMPLE_LOSS_KERNEL_CUDA("kl_div_loss", KLDivKernel) -REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("kl_div_loss_grad", KLDivGradKernel) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
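The KL-divergence functors above follow the usual convention in which `input` is already a log-probability, and `log_target` selects whether `target` is given in log space as well. A scalar C++ sketch of the per-element term, with SafeLog replaced by a plain guard and the function name purely illustrative:

    #include <cmath>

    // Element-wise KL divergence term; `input` is a log-probability.
    // When log_target is true, `target` is also a log-probability.
    double KlDivTerm(double input, double target, bool log_target) {
      if (log_target) {
        return std::exp(target) * (target - input);
      }
      // Plain-probability target: the term is defined to be 0 where target == 0.
      return target > 0.0 ? target * (std::log(target) - input) : 0.0;
    }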
+*/ +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +using namespace loss; + +template +struct KLDivFunctor { + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + if (LOG_TARGET) { + return exp(target_val) * (target_val - input_val); + } else { + const T zero_val = static_cast(0); + const T out_val = target_val * (SafeLog(target_val) - input_val); + return target_val > zero_val ? out_val : zero_val; + } + } +}; + +template +struct KLDivFunctor { + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + if (LOG_TARGET) { + return hexp(target_val) * (target_val - input_val); + } else { + const half zero_val = __float2half(0.f); + const half out_val = target_val * (SafeLog(target_val) - input_val); + return target_val > zero_val ? out_val : zero_val; + } + } +}; + +template +struct KLDivGradFunctor { + __device__ __forceinline__ T operator()(T target_val, T dy_val) const { + if (LOG_TARGET) { + return -exp(target_val) * dy_val; + } else { + const T zero_val = static_cast(0); + return target_val > zero_val ? -target_val * dy_val : zero_val; + } + } +}; + +template +struct KLDivGradFunctor { + __device__ __forceinline__ half operator()(half target_val, half dy_val) const { + if (LOG_TARGET) { + return __hneg(hexp(target_val) * dy_val); + } else { + const half zero_val = __float2half(0.f); + return target_val > zero_val ? __hneg(target_val * dy_val) : zero_val; + } + } +}; + +template +class KLDivKernel : public SimpleLossKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, T* out) const { + const bool log_target = ctx->Attr("log_target"); + if (log_target) { + OF_CUDA_CHECK( + (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK( + (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } + } +}; + +template +class KLDivGradKernel : public SimpleLossGradKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, const T* dy, T* dx) const { + const bool log_target = ctx->Attr("log_target"); + if (log_target) { + OF_CUDA_CHECK((cuda::elementwise::Binary( + KLDivGradFunctor(), elem_cnt, dx, target, dy, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Binary( + KLDivGradFunctor(), elem_cnt, dx, target, dy, + ctx->stream()->As()->cuda_stream()))); + } + } +}; + +} // namespace + +REGISTER_SIMPLE_LOSS_KERNEL_CUDA("kl_div_loss", KLDivKernel) +REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("kl_div_loss_grad", KLDivGradKernel) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp b/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp index 9672d97..27404bf 100644 --- a/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp +++ b/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp @@ -1,51 +1,51 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void L1L2RegularizeGradientGpu(int64_t n, const T* model, const T* model_diff, T* out, - const T l1, const T l2) { - CUDA_1D_KERNEL_LOOP(i, n) { - const T model_val = model[i]; - out[i] = model_diff[i] + l1 * ((model_val >= 0) - (model_val <= 0)) + l2 * model_val; - } -} - -} // namespace - -template -struct L1L2RegularizeGradientKernelUtil { - static void RegularizeGradient(ep::Stream* stream, int64_t n, const T* model, const T* model_diff, - T* out, const T l1, const T l2) { - L1L2RegularizeGradientGpu<<As()->cuda_stream()>>>(n, model, model_diff, - out, l1, l2); - } -}; - -#define INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct L1L2RegularizeGradientKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ); -#undef INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
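The regularize-gradient kernel above adds an L1 and an L2 penalty gradient directly into the model diff: out = model_diff + l1 * sign(model) + l2 * model, with sign(0) treated as 0 (that is what the (x >= 0) - (x <= 0) expression evaluates to at zero). A plain C++ sketch of the same update (names illustrative):

    #include <cstddef>
    #include <vector>

    // out[i] = model_diff[i] + l1 * sign(model[i]) + l2 * model[i], sign(0) == 0.
    void L1L2RegularizeGradient(const std::vector<float>& model,
                                const std::vector<float>& model_diff,
                                float l1, float l2, std::vector<float>& out) {
      out.resize(model.size());
      for (size_t i = 0; i < model.size(); ++i) {
        const float w = model[i];
        const float sign = static_cast<float>(w > 0.0f) - static_cast<float>(w < 0.0f);
        out[i] = model_diff[i] + l1 * sign + l2 * w;
      }
    }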
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void L1L2RegularizeGradientGpu(int64_t n, const T* model, const T* model_diff, T* out, + const T l1, const T l2) { + CUDA_1D_KERNEL_LOOP(i, n) { + const T model_val = model[i]; + out[i] = model_diff[i] + l1 * ((model_val >= 0) - (model_val <= 0)) + l2 * model_val; + } +} + +} // namespace + +template +struct L1L2RegularizeGradientKernelUtil { + static void RegularizeGradient(ep::Stream* stream, int64_t n, const T* model, const T* model_diff, + T* out, const T l1, const T l2) { + L1L2RegularizeGradientGpu<<As()->cuda_stream()>>>(n, model, model_diff, + out, l1, l2); + } +}; + +#define INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct L1L2RegularizeGradientKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ); +#undef INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/l2_normalize_kernel.hip.cpp b/oneflow/user/kernels/l2_normalize_kernel.hip.cpp index 5228003..f3ac7b2 100644 --- a/oneflow/user/kernels/l2_normalize_kernel.hip.cpp +++ b/oneflow/user/kernels/l2_normalize_kernel.hip.cpp @@ -1,150 +1,150 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void L2NormalizeForward(const int32_t n, const int32_t c, const int32_t d, - const T epsilon, const T* in, T* square_x_sum, T* out) { - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { - T sum = GetZeroVal(); - const int32_t offset = (i / d) * d * c + (i % d); - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const T x = in[offset + j * d]; - sum += x * x; - } - const T reduce_sum = BlockReduce(temp_storage).Sum(sum); - if (threadIdx.x == 0) { square_x_sum[i] = reduce_sum; } - __syncthreads(); - - const T inv_norm = rsqrtf(fmaxf(square_x_sum[i], epsilon)); - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - out[index] = inv_norm * in[index]; - } - } -} - -template -__global__ void L2NormalizeBackward(const int32_t n, const int32_t c, const int32_t d, - const float epsilon, const T* out, const T* out_diff, - const T* square_x_sum, T* in_diff) { - for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { - const T inv_norm = rsqrt(fmaxf(square_x_sum[i], epsilon)); - const int32_t offset = (i / d) * d * c + (i % d); - if (square_x_sum[i] >= epsilon) { - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage_prod_sum; - - T y_dy_prod_sum = GetZeroVal(); - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - y_dy_prod_sum += out[index] * out_diff[index]; - } - - const T reduce_y_dy_prod_sum = BlockReduce(temp_storage_prod_sum).Sum(y_dy_prod_sum); - __shared__ T y_dy_inner_prod; - if (threadIdx.x == 0) { y_dy_inner_prod = reduce_y_dy_prod_sum; } - __syncthreads(); - - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - in_diff[index] = inv_norm * (out_diff[index] - y_dy_inner_prod * out[index]); - } - } else { - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - in_diff[index] = inv_norm * out_diff[index]; - } - } - } -} - -} // namespace - -template -class GpuL2NormalizeKernel final : public user_op::OpKernel { - public: - GpuL2NormalizeKernel() = default; - ~GpuL2NormalizeKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); - const float epsilon = ctx->Attr("epsilon"); - int32_t axis = ctx->Attr("axis"); - int32_t c = x->shape_view().At(axis); - int32_t n = x->shape_view().elem_cnt() / c; - int32_t d = x->shape_view().Count(axis + 1); - RUN_CUDA_KERNEL((L2NormalizeForward), ctx->stream(), n, n, c, d, static_cast(epsilon), - x->dptr(), square_x_sum->mut_dptr(), y->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_L2_NORMALIZE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("l2_normalize") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_L2_NORMALIZE_KERNEL(float) - -template -class 
GpuL2NormalizeGradKernel final : public user_op::OpKernel { - public: - GpuL2NormalizeGradKernel() = default; - ~GpuL2NormalizeGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float epsilon = ctx->Attr("epsilon"); - int32_t axis = ctx->Attr("axis"); - int32_t c = dy->shape_view().At(axis); - int32_t n = dy->shape_view().elem_cnt() / c; - int32_t d = dy->shape_view().Count(axis + 1); - RUN_CUDA_KERNEL((L2NormalizeBackward), ctx->stream(), n, n, c, d, static_cast(epsilon), - y->dptr(), dy->dptr(), square_x_sum->dptr(), dx->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("l2_normalize_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(float) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
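In the forward kernel above, each block reduces one normalization slice to its squared sum, clamps it by epsilon, and scales the slice by the reciprocal square root; the squared sum is also written out because the backward pass reuses it. A host-side C++ sketch of one slice (function name illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // y = x / sqrt(max(sum(x * x), epsilon)); returns the squared sum for the backward pass.
    float L2NormalizeSlice(const std::vector<float>& x, float epsilon, std::vector<float>& y) {
      float square_x_sum = 0.0f;
      for (float v : x) { square_x_sum += v * v; }
      const float inv_norm = 1.0f / std::sqrt(std::max(square_x_sum, epsilon));
      y.resize(x.size());
      for (size_t i = 0; i < x.size(); ++i) { y[i] = x[i] * inv_norm; }
      return square_x_sum;
    }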
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void L2NormalizeForward(const int32_t n, const int32_t c, const int32_t d, + const T epsilon, const T* in, T* square_x_sum, T* out) { + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { + T sum = GetZeroVal(); + const int32_t offset = (i / d) * d * c + (i % d); + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const T x = in[offset + j * d]; + sum += x * x; + } + const T reduce_sum = BlockReduce(temp_storage).Sum(sum); + if (threadIdx.x == 0) { square_x_sum[i] = reduce_sum; } + __syncthreads(); + + const T inv_norm = rsqrtf(fmaxf(square_x_sum[i], epsilon)); + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + out[index] = inv_norm * in[index]; + } + } +} + +template +__global__ void L2NormalizeBackward(const int32_t n, const int32_t c, const int32_t d, + const float epsilon, const T* out, const T* out_diff, + const T* square_x_sum, T* in_diff) { + for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { + const T inv_norm = rsqrt(fmaxf(square_x_sum[i], epsilon)); + const int32_t offset = (i / d) * d * c + (i % d); + if (square_x_sum[i] >= epsilon) { + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage_prod_sum; + + T y_dy_prod_sum = GetZeroVal(); + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + y_dy_prod_sum += out[index] * out_diff[index]; + } + + const T reduce_y_dy_prod_sum = BlockReduce(temp_storage_prod_sum).Sum(y_dy_prod_sum); + __shared__ T y_dy_inner_prod; + if (threadIdx.x == 0) { y_dy_inner_prod = reduce_y_dy_prod_sum; } + __syncthreads(); + + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + in_diff[index] = inv_norm * (out_diff[index] - y_dy_inner_prod * out[index]); + } + } else { + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + in_diff[index] = inv_norm * out_diff[index]; + } + } + } +} + +} // namespace + +template +class GpuL2NormalizeKernel final : public user_op::OpKernel { + public: + GpuL2NormalizeKernel() = default; + ~GpuL2NormalizeKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); + const float epsilon = ctx->Attr("epsilon"); + int32_t axis = ctx->Attr("axis"); + int32_t c = x->shape_view().At(axis); + int32_t n = x->shape_view().elem_cnt() / c; + int32_t d = x->shape_view().Count(axis + 1); + RUN_CUDA_KERNEL((L2NormalizeForward), ctx->stream(), n, n, c, d, static_cast(epsilon), + x->dptr(), square_x_sum->mut_dptr(), y->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_L2_NORMALIZE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("l2_normalize") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_L2_NORMALIZE_KERNEL(float) + +template +class 
GpuL2NormalizeGradKernel final : public user_op::OpKernel { + public: + GpuL2NormalizeGradKernel() = default; + ~GpuL2NormalizeGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float epsilon = ctx->Attr("epsilon"); + int32_t axis = ctx->Attr("axis"); + int32_t c = dy->shape_view().At(axis); + int32_t n = dy->shape_view().elem_cnt() / c; + int32_t d = dy->shape_view().Count(axis + 1); + RUN_CUDA_KERNEL((L2NormalizeBackward), ctx->stream(), n, n, c, d, static_cast(epsilon), + y->dptr(), dy->dptr(), square_x_sum->dptr(), dx->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("l2_normalize_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(float) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp b/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp index 70da59a..d9f3285 100644 --- a/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp +++ b/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp @@ -1,465 +1,678 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
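The backward kernel above distinguishes two cases: if the squared sum exceeds epsilon, the norm depends on x and the gradient picks up a projection term built from the inner product of y and dy; if it was clamped, the norm is constant and the gradient is just dy scaled by the same inverse norm. A C++ sketch of one slice under that reading (names illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // dx = inv_norm * (dy - <y, dy> * y) when square_x_sum >= epsilon,
    // dx = inv_norm * dy otherwise (clamped norm, constant w.r.t. x).
    void L2NormalizeGradSlice(const std::vector<float>& y, const std::vector<float>& dy,
                              float square_x_sum, float epsilon, std::vector<float>& dx) {
      const float inv_norm = 1.0f / std::sqrt(std::max(square_x_sum, epsilon));
      dx.resize(y.size());
      if (square_x_sum >= epsilon) {
        float y_dy = 0.0f;
        for (size_t i = 0; i < y.size(); ++i) { y_dy += y[i] * dy[i]; }
        for (size_t i = 0; i < y.size(); ++i) { dx[i] = inv_norm * (dy[i] - y_dy * y[i]); }
      } else {
        for (size_t i = 0; i < y.size(); ++i) { dx[i] = inv_norm * dy[i]; }
      }
    }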
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cudnn_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/layer_norm.hip.h" - -namespace oneflow { - -namespace { - -template -struct AffineStore { - AffineStore(DST* y, int64_t row_size, const DST* gamma, const DST* beta) - : y(y), row_size(row_size), gamma(gamma), beta(beta) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::layer_norm::Pack y_pack; - cuda::layer_norm::Pack gamma_pack; - cuda::layer_norm::Pack beta_pack; - const int64_t offset = (row * row_size + col) / N; - const int64_t gamma_offset = col / N; - if (do_scale) { - gamma_pack.storage = - *(reinterpret_cast*>(gamma) + gamma_offset); - } else { -#pragma unroll - for (int i = 0; i < N; ++i) { gamma_pack.elem[i] = 1; } - } - if (do_center) { - beta_pack.storage = - *(reinterpret_cast*>(beta) + gamma_offset); - } else { -#pragma unroll - for (int i = 0; i < N; ++i) { beta_pack.elem[i] = 0; } - } -#pragma unroll - for (int i = 0; i < N; ++i) { - DST normalized_i = static_cast(src[i]); - if (do_scale || do_center) { - y_pack.elem[i] = normalized_i * gamma_pack.elem[i] + beta_pack.elem[i]; - } else { - y_pack.elem[i] = normalized_i; - } - } - *(reinterpret_cast*>(y) + offset) = y_pack.storage; - } - DST* y; - int64_t row_size; - const DST* gamma; - const DST* beta; -}; - -template -struct ScaleLoad { - ScaleLoad(const SRC* src, const SRC* gamma, int64_t row_size) - : src(src), gamma(gamma), row_size(row_size) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - cuda::layer_norm::Pack src_pack; - cuda::layer_norm::Pack gamma_pack; - const int64_t offset = (row * row_size + col) / N; - const int64_t gamma_offset = col / N; - src_pack.storage = *(reinterpret_cast*>(src) + offset); - if (do_scale) { - gamma_pack.storage = - *(reinterpret_cast*>(gamma) + gamma_offset); - } else { -#pragma unroll - for (int i = 0; i < N; ++i) { gamma_pack.elem[i] = static_cast(1); } - } -#pragma unroll - for (int i = 0; i < N; ++i) { - dst[i] = static_cast(src_pack.elem[i] * gamma_pack.elem[i]); - } - } - const SRC* src; - const SRC* gamma; - int64_t row_size; -}; - -template -struct AddStore { - AddStore(const DST* add_to_output, DST* dst, int64_t row_size) - : add_to_output(add_to_output), dst(dst), row_size(row_size) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::layer_norm::Pack add_to_output_pack; - cuda::layer_norm::Pack dst_pack; - const int64_t offset = (row * row_size + col) / N; - if (do_add) { - add_to_output_pack.storage = - *(reinterpret_cast*>(add_to_output) + offset); - } -#pragma unroll - for (int i = 0; i < N; ++i) { - if (do_add) { - dst_pack.elem[i] = static_cast(src[i]) + add_to_output_pack.elem[i]; - } else { - dst_pack.elem[i] = static_cast(src[i]); - } - } - *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; - } - const DST* add_to_output; - DST* dst; - int64_t row_size; -}; - -template -__inline__ __device__ T WarpReduce(T val) { -// for (int mask = 16; mask > 0; mask /= 2) { val += __shfl_down_sync(0xffffffff, val, mask); } - for (int mask = 32; mask > 0; mask /= 2) { val += __shfl_down(val, mask, 64); } - return 
val; -} - -constexpr int tile_size = 32; -constexpr int num_per_block = 4; -constexpr int block_dim_x = 32; -constexpr int block_dim_y = 32 / num_per_block; - -template -__global__ void LayerNormParamGrad(int rows, int cols, const T* __restrict__ dy, - const T* __restrict__ x, const ComputeType* __restrict__ mean, - const ComputeType* __restrict__ inv_var, - T* __restrict__ tmp_gamma_diff, T* __restrict__ tmp_beta_diff) { - __shared__ ComputeType dgamma[32][33]; - __shared__ ComputeType dbeta[32][33]; - ComputeType dgamma_sum[num_per_block]; - ComputeType dbeta_sum[num_per_block]; -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - dgamma_sum[index] = 0; - dbeta_sum[index] = 0; - } - const int col_id = blockIdx.x * blockDim.x + threadIdx.x; - if (col_id < cols) { - for (int i = blockIdx.y * tile_size + threadIdx.y; i < rows; i += tile_size * gridDim.y) { -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - int row_id = i + index * blockDim.y; - if (row_id < rows) { - int offset = row_id * cols + col_id; - const ComputeType dy_val = static_cast(dy[offset]); - const ComputeType x_val = static_cast(x[offset]); - const ComputeType mean_val = mean[row_id]; - const ComputeType inv_var_val = inv_var[row_id]; - dgamma_sum[index] += dy_val * (x_val - mean_val) * inv_var_val; - dbeta_sum[index] += dy_val; - } - } - } - } -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - dgamma[index * blockDim.y + threadIdx.y][threadIdx.x] = dgamma_sum[index]; - dbeta[index * blockDim.y + threadIdx.y][threadIdx.x] = dbeta_sum[index]; - } - __syncthreads(); -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - const int col_id = blockIdx.x * blockDim.x + threadIdx.y + index * blockDim.y; - if (col_id < cols) { - ComputeType gamma_sum = dgamma[threadIdx.x][threadIdx.y + index * blockDim.y]; - ComputeType beta_sum = dbeta[threadIdx.x][threadIdx.y + index * blockDim.y]; - ComputeType global_dgamma = WarpReduce(gamma_sum); - ComputeType global_dbeta = WarpReduce(beta_sum); - if (threadIdx.x == 0) { - const int offset = blockIdx.y * cols + col_id; - tmp_gamma_diff[offset] = global_dgamma; - tmp_beta_diff[offset] = global_dbeta; - } - } - } -} - -template -int GetGirdDimY(const int64_t num_instances, const int64_t norm_size) { - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - const int grid_dim_x = (norm_size + tile_size - 1) / tile_size; - const int max_grid_dim_y = (num_instances + tile_size - 1) / tile_size; - const int block_size = block_dim_x * block_dim_y; - int max_active_blocks = 0; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, LayerNormParamGrad, block_size, 0)); - int waves = 1; - int dev; - OF_CUDA_CHECK(hipGetDevice(&dev)); - int sm_count; - OF_CUDA_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev)); - int num_blocks = max_active_blocks * sm_count * waves; - int grid_dim_y = std::min(max_grid_dim_y, static_cast(num_blocks / grid_dim_x)); - return std::max(grid_dim_y, 1); -} - -template -void LayerNormForwardGpu(ep::Stream* stream, const int64_t num_instances, const int64_t norm_size, - const double epsilon, const T* x_ptr, const T* gamma_ptr, - const T* beta_ptr, T* y_ptr, user_op::Tensor* mean, - user_op::Tensor* inv_variance) { - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - cuda::layer_norm::DirectLoad load(x_ptr, norm_size); - AffineStore store(y_ptr, norm_size, gamma_ptr, beta_ptr); - 
cuda::layer_norm::DispatchLayerNorm( - stream->As()->cuda_stream(), load, store, num_instances, norm_size, epsilon, - mean->mut_dptr(), inv_variance->mut_dptr()); -} - -template -void DispatchLayerNormForwardGpu(ep::Stream* stream, const int64_t num_instances, - const int64_t norm_size, const double epsilon, const T* x_ptr, - const T* gamma_ptr, const T* beta_ptr, T* y_ptr, - user_op::Tensor* mean, user_op::Tensor* inv_variance) { - if (gamma_ptr != nullptr && beta_ptr != nullptr) { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr, - beta_ptr, y_ptr, mean, inv_variance); - } else if (gamma_ptr != nullptr && beta_ptr == nullptr) { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr, - beta_ptr, y_ptr, mean, inv_variance); - } else if (gamma_ptr == nullptr && beta_ptr != nullptr) { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr, - beta_ptr, y_ptr, mean, inv_variance); - } else { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, - gamma_ptr, beta_ptr, y_ptr, mean, inv_variance); - } -} - -template -void LayerNormBackwardGpu(ep::Stream* stream, const int64_t num_instances, const int64_t norm_size, - const T* dy_ptr, const T* x_ptr, const user_op::Tensor* mean, - const user_op::Tensor* inv_variance, const T* gamma_ptr, - const T* add_to_output_ptr, T* dx_ptr) { - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - cuda::layer_norm::DirectLoad load_x(x_ptr, norm_size); - ScaleLoad load_scaled_dy(dy_ptr, gamma_ptr, norm_size); - AddStore store(add_to_output_ptr, dx_ptr, norm_size); - OF_CUDA_CHECK((cuda::layer_norm::DispatchLayerNormGrad( - stream->As()->cuda_stream(), load_x, load_scaled_dy, store, - mean->dptr(), inv_variance->dptr(), num_instances, norm_size))); -} - -template -void DispatchLayerNormBackwardDoAdd(ep::Stream* stream, const int64_t num_instances, - const int64_t norm_size, const T* dy_ptr, const T* x_ptr, - const user_op::Tensor* mean, - const user_op::Tensor* inv_variance, const T* gamma_ptr, - const T* add_to_output_ptr, T* dx_ptr) { - if (add_to_output_ptr != nullptr) { - LayerNormBackwardGpu(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } else { - LayerNormBackwardGpu(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } -} - -template -void LaunchLayerNormBackward(ep::Stream* stream, const int64_t num_instances, - const int64_t norm_size, const T* dy_ptr, const T* x_ptr, - const user_op::Tensor* mean, const user_op::Tensor* inv_variance, - const T* gamma_ptr, const T* add_to_output_ptr, T* dx_ptr) { - if (gamma_ptr != nullptr) { - DispatchLayerNormBackwardDoAdd(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } else { - DispatchLayerNormBackwardDoAdd(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } -} - -} // namespace - -template -class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - LayerNormGpuKernel() = default; - ~LayerNormGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = 
ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - const double epsilon = ctx->Attr("epsilon"); - CHECK_GE(epsilon, HIPDNN_BN_MIN_EPSILON); - const int64_t num_instances = mean->shape_view().elem_cnt(); - const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; - const T* gamma_ptr = nullptr; - const T* beta_ptr = nullptr; - if (ctx->has_input("gamma", 0)) { - const user_op::Tensor* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - gamma_ptr = gamma->dptr(); - CHECK_EQ(gamma->shape_view().elem_cnt(), norm_size); - } - if (ctx->has_input("beta", 0)) { beta_ptr = ctx->Tensor4ArgNameAndIndex("beta", 0)->dptr(); } - DispatchLayerNormForwardGpu(ctx->stream(), num_instances, norm_size, epsilon, x->dptr(), - gamma_ptr, beta_ptr, y->mut_dptr(), mean, inv_variance); - }; -}; - -#define REGISTER_LAYER_NORM_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("layer_norm") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_LAYER_NORM_CUDA_KERNEL(float) -REGISTER_LAYER_NORM_CUDA_KERNEL(double) -REGISTER_LAYER_NORM_CUDA_KERNEL(half) - -template -class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - LayerNormGradGpuKernel() = default; - ~LayerNormGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t num_instances = mean->shape_view().elem_cnt(); - const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; - const T* gamma_ptr = nullptr; - if (ctx->has_input("gamma", 0)) { - gamma_ptr = ctx->Tensor4ArgNameAndIndex("gamma", 0)->dptr(); - } - const T* add_to_output_ptr = nullptr; - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->data_type(), dx->data_type()); - CHECK_EQ(add_to_output->shape_view(), dx->shape_view()); - add_to_output_ptr = add_to_output->dptr(); - } - LaunchLayerNormBackward(ctx->stream(), num_instances, norm_size, dy->dptr(), x->dptr(), - mean, inv_variance, gamma_ptr, add_to_output_ptr, dx->mut_dptr()); - }; -}; - -#define REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("layer_norm_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value)) \ - .SetInplaceProposalFn( \ - [](const user_op::InferContext& ctx, \ - const user_op::AddInplaceArgPair& AddInplaceArgPairFn) -> Maybe { \ - if (ctx.has_input("_add_to_output", 0)) { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "_add_to_output", 0, true)); \ - } \ - return Maybe::Ok(); \ - }); - -REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(float) -REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(double) -REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(half) - -template -class LayerNormParamGradGpuKernel 
final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - LayerNormParamGradGpuKernel() = default; - ~LayerNormParamGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - const int64_t num_instances = mean->shape_view().elem_cnt(); - const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const DataType data_type = dy->data_type(); - const int grid_dim_x = (norm_size + tile_size - 1) / tile_size; - const int grid_dim_y = GetGirdDimY(num_instances, norm_size); - const size_t tmp_gamma_diff_size = grid_dim_y * norm_size * sizeof(T); - T* tmp_gamma_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); - T* tmp_beta_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr() + tmp_gamma_diff_size); - T* reduce_buf_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + 2 * tmp_gamma_diff_size); - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - LayerNormParamGrad<<stream()->As()->cuda_stream()>>>( - num_instances, norm_size, dy->dptr(), x->dptr(), mean->dptr(), - inv_variance->dptr(), tmp_gamma_diff_ptr, tmp_beta_diff_ptr); - const int32_t m = norm_size; - const int32_t n = 1; - const int32_t k = grid_dim_y; - std::unique_ptr fill = - ep::primitive::NewPrimitive(ctx->stream()->device_type(), - data_type); - CHECK(fill); - fill->Launch(ctx->stream(), reduce_buf_ptr, 1.0, grid_dim_y); - std::unique_ptr matmul = - ep::primitive::NewPrimitive( - ctx->stream()->device_type(), data_type, ep::primitive::BlasTransposeType::T, - ep::primitive::BlasTransposeType::N); - CHECK(matmul); - if (ctx->has_output("gamma_diff", 0)) { - user_op::Tensor* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0); - matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_gamma_diff_ptr, reduce_buf_ptr, 0.0, - gamma_diff->mut_dptr()); - } - if (ctx->has_output("beta_diff", 0)) { - user_op::Tensor* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0); - matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_beta_diff_ptr, reduce_buf_ptr, 0.0, - beta_diff->mut_dptr()); - } - }; -}; - -#define REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("layer_norm_param_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const int64_t begin_params_axis = ctx->Attr("begin_params_axis"); \ - const bool has_gamma_diff = ctx->has_output("gamma_diff", 0); \ - const bool has_beta_diff = ctx->has_output("beta_diff", 0); \ - const auto& dy = ctx->InputTensorDesc("dy", 0); \ - const int64_t num_instances = dy.shape().Count(0, begin_params_axis); \ - const int64_t norm_size = dy.shape().Count(begin_params_axis); \ - const int grid_dim_y = GetGirdDimY(num_instances, norm_size); \ - size_t tmp_buffer_size = (2 * grid_dim_y * norm_size + grid_dim_y) * sizeof(dtype); \ - return tmp_buffer_size; \ - }); - -REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(float) 
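The param-grad kernel above is a two-stage reduction: each of the grid_dim_y block rows writes a [grid_dim_y, norm_size] matrix of partial gamma/beta gradients into the temporary buffer (hence the (2 * grid_dim_y * norm_size + grid_dim_y) * sizeof(dtype) buffer size), and the second stage collapses those partials with a matmul against a vector of ones instead of a dedicated reduction kernel. A plain C++ sketch of that final reduction step (illustrative, not OneFlow API):

    #include <cstdint>
    #include <vector>

    // Collapse [grid_dim_y, norm_size] partial sums into [norm_size] totals.
    // Equivalent to the kernel's matmul(tmp^T, ones): the ones vector sums over rows.
    std::vector<float> ReducePartialSums(const std::vector<float>& tmp,  // grid_dim_y * norm_size
                                         int64_t grid_dim_y, int64_t norm_size) {
      const std::vector<float> ones(grid_dim_y, 1.0f);
      std::vector<float> out(norm_size, 0.0f);
      for (int64_t col = 0; col < norm_size; ++col) {
        for (int64_t row = 0; row < grid_dim_y; ++row) {
          out[col] += tmp[row * norm_size + col] * ones[row];
        }
      }
      return out;
    }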
-REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(double)
-REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(half)
-
-}  // namespace oneflow
\ No newline at end of file
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "hip/hip_runtime.h"
+#include "oneflow/core/device/cudnn_util.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/ndarray/ndarray_util.h"
+#include "oneflow/core/hip/atomic.hip.h"
+#include
+#include "oneflow/core/kernel/cuda_graph_support.h"
+#include "oneflow/core/ep/include/primitive/fill.h"
+#include "oneflow/core/ep/include/primitive/matmul.h"
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#include "oneflow/core/hip/layer_norm.hip.h"
+#include
+#include
+
+template <typename T, bool is_cuda>
+struct AccumulateType {};
+
+#if defined(__HIPCC__)
+template <> struct AccumulateType<half, true> { using type = float; };
+#endif
+template <> struct AccumulateType<float, true> { using type = float; };
+template <> struct AccumulateType<double, true> { using type = double; };
+template <> struct AccumulateType<int8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<char, true> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, true> { using type = int64_t; };
+template <> struct AccumulateType<bool, true> { using type = bool; };
+template <> struct AccumulateType<float, false> { using type = double; };
+template <> struct AccumulateType<double, false> { using type = double; };
+template <> struct AccumulateType<int8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<char, false> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, false> { using type = int64_t; };
+template <> struct AccumulateType<bool, false> { using type = bool; };
+
+template <typename T, bool is_cuda>
+using acc_type = typename AccumulateType<T, is_cuda>::type;
+
+#define C10_HOST_DEVICE __host__ __device__
+#define C10_DEVICE __device__
+#define C10_HOST __host__
+#define C10_WARP_SIZE 64
+
+#define VEC 4
+typedef int64_t IndexType;
+
+constexpr int BlockReduceNumThreads = 512;
+constexpr int NumThreads = 256;
+constexpr int ColwiseReduceTileSize = 32;
+
+template <typename scalar_t, typename index_t, typename combine_t>
+struct WelfordData {
+  scalar_t mean;
+  scalar_t m2;
+  index_t n;
+  combine_t nf;
+
+  C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
+
+  C10_HOST_DEVICE WelfordData(
+      scalar_t mean,
+      scalar_t m2,
+      index_t n,
+      combine_t nf)
+      : mean(mean), m2(m2), n(n), nf(nf) {}
+};
+
+
+template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t,
+          typename res_t>
+struct WelfordOps {
+ public:
+  using acc_t = WelfordData<acc_scalar_t, index_t, combine_t>;
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data) const {
+    acc_scalar_t delta = data - acc.mean;
+    // using acc.nf (combine_t) here, as acc.n (index_t) would still be converted;
+    // accumulation in reduce is done through index_t
+    acc_scalar_t new_mean = acc.mean + delta / (acc.nf + 1);
+    acc_scalar_t new_delta = data -
new_mean; + return { + new_mean, + acc.m2 + delta * new_delta, + acc.n + 1, + combine_t(acc.n + 1), // accumulate for combine_t uses index_t + }; + } + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + if (a.nf == 0) { + return b; + } + if (b.nf == 0) { + return a; + } + acc_scalar_t delta = b.mean - a.mean; + combine_t new_count = a.nf + b.nf; + acc_scalar_t nb_over_n = b.nf / new_count; + return { + a.mean + delta * nb_over_n, + a.m2 + b.m2 + delta * delta * a.nf * nb_over_n, + // setting acc.n as -1 since acc.n might not be able to represent the count + // correctly within its range, setting it to -1 to avoid confusion + -1, + new_count + }; + } + inline C10_DEVICE res_t project(acc_t acc) const { + return res_t(acc.m2 / acc.nf, static_cast(acc.mean)); + } + + inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + __shfl_down(acc.mean, offset) + , __shfl_down(acc.m2, offset) + , __shfl_down(acc.n, offset) + , __shfl_down(acc.nf, offset) + }; + } +}; + +template +__inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ __device__ T WarpReduce(T val,int max,const ReduceOp& op) { +#pragma unroll + for (int offset = max; offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ __device__ T +BlockReduce(T val, const ReduceOp& op, T* shared) { + const int lid = threadIdx.x % C10_WARP_SIZE; + const int wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduce(val, op); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + if (wid == 0) { + val= shared[lid]; + val = WarpReduce(val,blockDim.x / C10_WARP_SIZE / 2,op); + } + return val; +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += __shfl_down(val, offset); + } + return val; +} + +template +__inline__ __device__ T WarpReduceSum(T val,int max) { +#pragma unroll + for (int offset = max; offset > 0; offset >>= 1) { + val += __shfl_down(val, offset); + } + return val; +} + + +template +__inline__ __device__ T BlockReduceSum(T val, T* shared) { + const int lid = threadIdx.x % C10_WARP_SIZE; + const int wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + if (wid == 0) { + val= shared[lid]; + val = WarpReduceSum(val,blockDim.x / C10_WARP_SIZE / 2); + } + return val; +} + +template +__global__ void layernorm_forward_kernel(const scalar_t* input,scalar_t* ret,acc_type* mean,acc_type* rstd, + const scalar_t* gamma,const scalar_t* beta,IndexType cols,double eps) +{ + //dropout do nothing in val mode + IndexType i=blockIdx.x; + // add + layernorm get mean and rstd + using T_ACC = acc_type; + using WelfordType = WelfordData; + using WelfordOp = WelfordOps>; + __shared__ typename std::aligned_storage::type val_shared[BlockReduceNumThreads/C10_WARP_SIZE]; + WelfordType* val_shared_ptr = reinterpret_cast(val_shared); + WelfordOp welford_op; + WelfordType val; + + #pragma unroll + for (IndexType j = threadIdx.x; j < cols; j += blockDim.x) { + IndexType index = i * cols + j; + val = welford_op.reduce(val, static_cast(input[index])); + } + val = BlockReduce(val,welford_op,val_shared_ptr); + + __shared__ T_ACC s_mean; + __shared__ T_ACC 
s_rstd; + if (threadIdx.x == 0) { + thrust::tie(s_rstd, s_mean) = welford_op.project(val); + mean[i] = s_mean; + s_rstd=rsqrt(s_rstd + static_cast(eps)); + rstd[i] = s_rstd; + } + __syncthreads(); + //layernorm (x-mean)*rstd*gamma+beta + #pragma unroll + for (IndexType j = threadIdx.x; j < cols; j += blockDim.x) { + IndexType index = i * cols + j; + ret[index] = (static_cast(input[index]) - s_mean)*s_rstd * (gamma == nullptr ? T_ACC(1) : static_cast(gamma[j])) + + (beta == nullptr ? T_ACC(0) : static_cast(beta[j])); + } +} + +template +void LayerNormKernelImplInternal( + oneflow::ep::Stream* stream, + const T* X, + const T* gamma, + const T* beta, + int64_t M, + int64_t N, + double eps, + T* Y, + acc_type* mean, + acc_type* rstd) { + using T_ACC = acc_type; + const T* X_data = X; + const T* gamma_data = gamma; + const T* beta_data = beta; + T* Y_data = Y; + T_ACC* mean_data = mean; + T_ACC* rstd_data = rstd; + hipStream_t cuda_stream = stream->As()->cuda_stream(); + layernorm_forward_kernel<<>>( + X_data,Y_data,mean_data,rstd_data,gamma_data,beta_data,N,eps); +} + +template +__global__ void GammaBetaBackwardSimple(IndexType M,IndexType N,const scalar_t* dY,const scalar_t* X,const acc_type* mean, + const acc_type* rstd,scalar_t* dg,scalar_t* db) +{ + using T_ACC = acc_type; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackward(IndexType M,IndexType N,const scalar_t* dY,const scalar_t* X,const acc_type* mean, + const acc_type* rstd,scalar_t* dg,scalar_t* db) +{ + using T_ACC = acc_type; + __shared__ T_ACC g_shared[ColwiseReduceTileSize][ColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[ColwiseReduceTileSize][ColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__global__ void LayerNormBackward_kernel(IndexType N,const scalar_t* dY,const scalar_t* X,const scalar_t* gamma,const acc_type* mean, + const acc_type* rstd, scalar_t* dX, const scalar_t* add_to_output) +{ + using T_ACC = acc_type; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const IndexType i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + #pragma unroll + for (IndexType j = threadIdx.x; j < N; j += blockDim.x) { + const IndexType index = i * N + j; + const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + const T_ACC s = T_ACC(1) / static_cast(N); + __shared__ T_ACC b; + __shared__ T_ACC c; + if (threadIdx.x == 0) { + b = (sum2 * static_cast(mean[i]) - sum1) * static_cast(rstd[i]) * static_cast(rstd[i]) *static_cast(rstd[i]) * s; + c = -(b * static_cast(mean[i]) + sum2 * static_cast(rstd[i]) * s); + } + __syncthreads(); + #pragma unroll + for (IndexType j = threadIdx.x; j < N; j += blockDim.x) { + const IndexType index = i * N + j; + const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + dX[index] = static_cast(rstd[i]) * static_cast(dY[index]) * gamma_v + b * static_cast(X[index]) + c + + (add_to_output == nullptr ? 
T_ACC(0) : static_cast(add_to_output[index])); + } +} + +template +void LayerNormBackwardKernelImplInternal( + oneflow::ep::Stream* stream, + const T* dY, + const T* X, + const acc_type* mean, + const acc_type* rstd, + const T* gamma, + int64_t M, + int64_t N, + T* dX, + const T* add_to_output) { + using T_ACC = acc_type; + const T* dY_data = dY; + const T* X_data = X; + const T_ACC* mean_data = mean; + const T_ACC* rstd_data = rstd; + const T* gamma_data = gamma; + T* dX_data = dX; + const T* add_to_output_data = add_to_output; + hipStream_t cuda_stream = stream->As()->cuda_stream(); + if (dX_data != nullptr) { + LayerNormBackward_kernel<<>>( + N, dY_data, X_data,gamma_data,mean_data,rstd_data,dX_data,add_to_output_data); + } +} + +template +void LayerNormBackwardKernelImplInternalParam( + oneflow::ep::Stream* stream, + const T* dY, + const T* X, + const acc_type* mean, + const acc_type* rstd, + int64_t M, + int64_t N, + T* dgamma, + T* dbeta) { + using T_ACC = acc_type; + const T* dY_data = dY; + const T* X_data = X; + const T_ACC* mean_data = mean; + const T_ACC* rstd_data = rstd; + hipStream_t cuda_stream = stream->As()->cuda_stream(); + T* dgamma_data = dgamma; + T* dbeta_data = dbeta; + if (M < 512) { + // For small batch size, do colwise reduce directly. + const int64_t B = (N + NumThreads - 1) / NumThreads; + GammaBetaBackwardSimple + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + } else { + const int64_t B = + (N + ColwiseReduceTileSize - 1) / ColwiseReduceTileSize; + constexpr int kThreadX = ColwiseReduceTileSize; + constexpr int kThreadY = ColwiseReduceTileSize / 2; + GammaBetaBackward + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + } +} + +namespace oneflow { + +template +class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + LayerNormGpuKernel() = default; + ~LayerNormGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); + user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); + double epsilon = ctx->Attr("epsilon"); + int64_t num_instances = mean->shape_view().elem_cnt(); + int64_t norm_size = x->shape_view().elem_cnt() / num_instances; + const T* gamma_ptr = nullptr; + const T* beta_ptr = nullptr; + if (ctx->has_input("gamma", 0)) { + const user_op::Tensor* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + gamma_ptr = gamma->dptr(); + CHECK_EQ(gamma->shape_view().elem_cnt(), norm_size); + } + if (ctx->has_input("beta", 0)) { beta_ptr = ctx->Tensor4ArgNameAndIndex("beta", 0)->dptr(); } + // DispatchLayerNormForwardGpu(ctx->stream(), num_instances, norm_size, epsilon, x->dptr(), + // gamma_ptr, beta_ptr, y->mut_dptr(), mean, inv_variance); + using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; + LayerNormKernelImplInternal(ctx->stream(), x->dptr(), gamma_ptr, beta_ptr, num_instances, norm_size, epsilon, + y->mut_dptr(), mean->mut_dptr(), inv_variance->mut_dptr()); + }; +}; + +#define REGISTER_LAYER_NORM_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("layer_norm") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == 
DeviceType::kCUDA)                                      \
+                       && (user_op::HobDataType("x", 0) == GetDataType<dtype>::value));
+
+REGISTER_LAYER_NORM_CUDA_KERNEL(float)
+REGISTER_LAYER_NORM_CUDA_KERNEL(double)
+REGISTER_LAYER_NORM_CUDA_KERNEL(half)
+#if CUDA_VERSION >= 11000
+REGISTER_LAYER_NORM_CUDA_KERNEL(nv_bfloat16)
+#endif
+
+template<typename T>
+class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
+ public:
+  LayerNormGradGpuKernel() = default;
+  ~LayerNormGradGpuKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
+    const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0);
+    const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0);
+    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
+    int64_t num_instances = mean->shape_view().elem_cnt();
+    int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
+    const T* gamma_ptr = nullptr;
+    if (ctx->has_input("gamma", 0)) {
+      gamma_ptr = ctx->Tensor4ArgNameAndIndex("gamma", 0)->dptr<T>();
+    }
+    const T* add_to_output_ptr = nullptr;
+    if (ctx->has_input("_add_to_output", 0)) {
+      const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0);
+      CHECK_EQ(add_to_output->data_type(), dx->data_type());
+      CHECK_EQ(add_to_output->shape_view(), dx->shape_view());
+      add_to_output_ptr = add_to_output->dptr<T>();
+    }
+    // LaunchLayerNormBackward(ctx->stream(), num_instances, norm_size, dy->dptr(), x->dptr(),
+    //                         mean, inv_variance, gamma_ptr, add_to_output_ptr, dx->mut_dptr());
+    using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
+    LayerNormBackwardKernelImplInternal(ctx->stream(), dy->dptr<T>(), x->dptr<T>(),
+                                        mean->dptr<ComputeType>(), inv_variance->dptr<ComputeType>(),
+                                        gamma_ptr, num_instances, norm_size, dx->mut_dptr<T>(),
+                                        add_to_output_ptr);
+  };
+};
+
+#define REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(dtype)                                        \
+  REGISTER_USER_KERNEL("layer_norm_grad")                                                  \
+      .SetCreateFn<LayerNormGradGpuKernel<dtype>>()                                        \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
+                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value))    \
+      .SetInplaceProposalFn(                                                               \
+          [](const user_op::InferContext& ctx,                                             \
+             const user_op::AddInplaceArgPair& AddInplaceArgPairFn) -> Maybe<void> {       \
+            if (ctx.has_input("_add_to_output", 0)) {                                      \
+              OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "_add_to_output", 0, true)); \
+            }                                                                              \
+            return Maybe<void>::Ok();                                                      \
+          });
+
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(float)
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(double)
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(half)
+#if CUDA_VERSION >= 11000
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(nv_bfloat16)
+#endif
+
+template<typename T>
+class LayerNormParamGradGpuKernel final : public user_op::OpKernel,
+                                          public user_op::CudaGraphSupport {
+ public:
+  LayerNormParamGradGpuKernel() = default;
+  ~LayerNormParamGradGpuKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
+    const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0);
+    const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0);
+    int64_t num_instances = mean->shape_view().elem_cnt();
+    int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
+    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+    // const DataType data_type = dy->data_type();
+    // const int grid_dim_x = (norm_size + tile_size - 1) / tile_size;
+    // const int grid_dim_y = GetGirdDimY(num_instances, norm_size);
+    // const size_t tmp_gamma_diff_size = grid_dim_y * norm_size * sizeof(T);
+    // T* tmp_gamma_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr());
+    // T* tmp_beta_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr() + tmp_gamma_diff_size);
+    // T* reduce_buf_ptr =
+    //     reinterpret_cast(tmp_buffer->mut_dptr() + 2 * tmp_gamma_diff_size);
+    using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
+    // LayerNormParamGrad<<stream()->As()->cuda_stream()>>>(
+    //     num_instances, norm_size, dy->dptr(), x->dptr(), mean->dptr(),
+    //     inv_variance->dptr(), tmp_gamma_diff_ptr, tmp_beta_diff_ptr);
+    // const int32_t m = norm_size;
+    // const int32_t n = 1;
+    // const int32_t k = grid_dim_y;
+    // std::unique_ptr fill =
+    //     ep::primitive::NewPrimitive(ctx->stream()->device_type(),
+    //                                 data_type);
+    // CHECK(fill);
+    // fill->Launch(ctx->stream(), reduce_buf_ptr, 1.0, grid_dim_y);
+    // std::unique_ptr matmul =
+    //     ep::primitive::NewPrimitive(
+    //         ctx->stream()->device_type(), data_type, ep::primitive::BlasTransposeType::T,
+    //         ep::primitive::BlasTransposeType::N);
+    // CHECK(matmul);
+    // if (ctx->has_output("gamma_diff", 0)) {
+    //   user_op::Tensor* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0);
+    //   matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_gamma_diff_ptr, reduce_buf_ptr, 0.0,
+    //                  gamma_diff->mut_dptr());
+    // }
+    // if (ctx->has_output("beta_diff", 0)) {
+    //   user_op::Tensor* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0);
+    //   matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_beta_diff_ptr, reduce_buf_ptr, 0.0,
+    //                  beta_diff->mut_dptr());
+    // }
+    T* gamma_diff_ptr = nullptr;
+    T* beta_diff_ptr = nullptr;
+    if (ctx->has_output("gamma_diff", 0)) {
+      gamma_diff_ptr = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0)->mut_dptr<T>();
+    }
+    if (ctx->has_output("beta_diff", 0)) {
+      beta_diff_ptr = ctx->Tensor4ArgNameAndIndex("beta_diff", 0)->mut_dptr<T>();
+    }
+    LayerNormBackwardKernelImplInternalParam(ctx->stream(), dy->dptr<T>(), x->dptr<T>(),
+                                             mean->dptr<ComputeType>(), inv_variance->dptr<ComputeType>(),
+                                             num_instances, norm_size, gamma_diff_ptr, beta_diff_ptr);
+  };
+};
+
+#define REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(dtype)                                    \
+  REGISTER_USER_KERNEL("layer_norm_param_grad")                                             \
+      .SetCreateFn<LayerNormParamGradGpuKernel<dtype>>()                                    \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                      \
+                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value))     \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                   \
+        const int64_t begin_params_axis = ctx->Attr<int64_t>("begin_params_axis");          \
+        const bool has_gamma_diff = ctx->has_output("gamma_diff", 0);                       \
+        const bool has_beta_diff = ctx->has_output("beta_diff", 0);                         \
+        const auto& dy = ctx->InputTensorDesc("dy", 0);                                     \
+        const int64_t num_instances = dy.shape().Count(0, begin_params_axis);               \
+        const int64_t norm_size = dy.shape().Count(begin_params_axis);                      \
+        const int grid_dim_y = num_instances;                                               \
+        size_t tmp_buffer_size = (2 * grid_dim_y * norm_size + grid_dim_y) * sizeof(dtype); \
+        return tmp_buffer_size;                                                             \
+      });
+
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(float)
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(double)
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(half)
+#if CUDA_VERSION >= 11000
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(nv_bfloat16) +#endif + +} \ No newline at end of file diff --git a/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp b/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp index db2ebfd..de7d6f2 100644 --- a/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp +++ b/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp @@ -1,244 +1,244 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/math_binary_elementwise_func.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template class BinaryFunctor, typename T> -__global__ void MathBinaryElementwiseForwardGpu(const int64_t n, const T* x, const T* y, T* z) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { z[i] = BinaryFunctor::Forward(x[i], y[i]); } -} - -template class BinaryFunctor, typename T> -__global__ void MathBinaryElementwiseBackwardXGradGpu(const int64_t n, const T* x, const T* y, - const T* dz, T* dx) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { - dx[i] = BinaryFunctor::BackwardXGrad(x[i], y[i], dz[i]); - } -} - -template class BinaryFunctor, typename T> -__global__ void MathBinaryElementwiseBackwardYGradGpu(const int64_t n, const T* x, const T* y, - const T* dz, T* dy) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { - dy[i] = BinaryFunctor::BackwardYGrad(x[i], y[i], dz[i]); - } -} - -} // namespace - -template class BinaryFunctor, typename T> -class MathBinaryElementwiseGpuKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseGpuKernel() = default; - ~MathBinaryElementwiseGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>( - n, tensor_x->dptr(), tensor_y->dptr(), tensor_z->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor, typename T> -class MathBinaryElementwiseXGradGpuKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseXGradGpuKernel() = default; - ~MathBinaryElementwiseXGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - int64_t n = 
tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardXGradGpu - <<stream()->As()->cuda_stream()>>>( - n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), - tensor_dx->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor, typename T> -class MathBinaryElementwiseYGradGpuKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseYGradGpuKernel() = default; - ~MathBinaryElementwiseYGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardYGradGpu - <<stream()->As()->cuda_stream()>>>( - n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), - tensor_dy->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ - REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ - .SetCreateFn< \ - MathBinaryElementwiseGpuKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ - \ - REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_x_grad")) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ - REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_y_grad")) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - MATH_BINARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - OF_PP_MAKE_TUPLE_SEQ("floordiv", FloorDiv), - INT_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) - -template class BinaryFunctor> -class MathBinaryElementwiseGpuHalfKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseGpuHalfKernel() = default; - ~MathBinaryElementwiseGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* y = reinterpret_cast(tensor_y->dptr()); - half* z = reinterpret_cast(tensor_z->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, y, z); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor> -class MathBinaryElementwiseXGradGpuHalfKernel final : public user_op::OpKernel { - 
public: - MathBinaryElementwiseXGradGpuHalfKernel() = default; - ~MathBinaryElementwiseXGradGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* y = reinterpret_cast(tensor_y->dptr()); - const half* dz = reinterpret_cast(tensor_dz->dptr()); - half* dx = reinterpret_cast(tensor_dx->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardXGradGpu - <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dx); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor> -class MathBinaryElementwiseYGradGpuHalfKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseYGradGpuHalfKernel() = default; - ~MathBinaryElementwiseYGradGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* y = reinterpret_cast(tensor_y->dptr()); - const half* dz = reinterpret_cast(tensor_dz->dptr()); - half* dy = reinterpret_cast(tensor_dy->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardYGradGpu - <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dy); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD(math_type_str, \ - math_func_prefix) \ - REGISTER_USER_KERNEL(math_type_str) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ - \ - REGISTER_USER_KERNEL((std::string("") + math_type_str + "_x_grad")) \ - .SetCreateFn< \ - MathBinaryElementwiseXGradGpuHalfKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ - REGISTER_USER_KERNEL((std::string("") + math_type_str + "_y_grad")) \ - .SetCreateFn< \ - MathBinaryElementwiseYGradGpuHalfKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); - -OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD, - MATH_BINARY_ELEMENTWISE_FUNC_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/math_binary_elementwise_func.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template class BinaryFunctor, typename T> +__global__ void MathBinaryElementwiseForwardGpu(const int64_t n, const T* x, const T* y, T* z) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { z[i] = BinaryFunctor::Forward(x[i], y[i]); } +} + +template class BinaryFunctor, typename T> +__global__ void MathBinaryElementwiseBackwardXGradGpu(const int64_t n, const T* x, const T* y, + const T* dz, T* dx) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { + dx[i] = BinaryFunctor::BackwardXGrad(x[i], y[i], dz[i]); + } +} + +template class BinaryFunctor, typename T> +__global__ void MathBinaryElementwiseBackwardYGradGpu(const int64_t n, const T* x, const T* y, + const T* dz, T* dy) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { + dy[i] = BinaryFunctor::BackwardYGrad(x[i], y[i], dz[i]); + } +} + +} // namespace + +template class BinaryFunctor, typename T> +class MathBinaryElementwiseGpuKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseGpuKernel() = default; + ~MathBinaryElementwiseGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>( + n, tensor_x->dptr(), tensor_y->dptr(), tensor_z->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor, typename T> +class MathBinaryElementwiseXGradGpuKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseXGradGpuKernel() = default; + ~MathBinaryElementwiseXGradGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardXGradGpu + <<stream()->As()->cuda_stream()>>>( + n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), + tensor_dx->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor, typename T> +class MathBinaryElementwiseYGradGpuKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseYGradGpuKernel() = default; + ~MathBinaryElementwiseYGradGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void 
Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardYGradGpu + <<stream()->As()->cuda_stream()>>>( + n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), + tensor_dy->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ + REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ + .SetCreateFn< \ + MathBinaryElementwiseGpuKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ + \ + REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_x_grad")) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ + REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_y_grad")) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + MATH_BINARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + OF_PP_MAKE_TUPLE_SEQ("floordiv", FloorDiv), + INT_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) + +template class BinaryFunctor> +class MathBinaryElementwiseGpuHalfKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseGpuHalfKernel() = default; + ~MathBinaryElementwiseGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* y = reinterpret_cast(tensor_y->dptr()); + half* z = reinterpret_cast(tensor_z->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, y, z); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor> +class MathBinaryElementwiseXGradGpuHalfKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseXGradGpuHalfKernel() = default; + ~MathBinaryElementwiseXGradGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* y = 
reinterpret_cast(tensor_y->dptr()); + const half* dz = reinterpret_cast(tensor_dz->dptr()); + half* dx = reinterpret_cast(tensor_dx->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardXGradGpu + <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dx); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor> +class MathBinaryElementwiseYGradGpuHalfKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseYGradGpuHalfKernel() = default; + ~MathBinaryElementwiseYGradGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* y = reinterpret_cast(tensor_y->dptr()); + const half* dz = reinterpret_cast(tensor_dz->dptr()); + half* dy = reinterpret_cast(tensor_dy->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardYGradGpu + <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dy); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD(math_type_str, \ + math_func_prefix) \ + REGISTER_USER_KERNEL(math_type_str) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ + \ + REGISTER_USER_KERNEL((std::string("") + math_type_str + "_x_grad")) \ + .SetCreateFn< \ + MathBinaryElementwiseXGradGpuHalfKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ + REGISTER_USER_KERNEL((std::string("") + math_type_str + "_y_grad")) \ + .SetCreateFn< \ + MathBinaryElementwiseYGradGpuHalfKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); + +OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD, + MATH_BINARY_ELEMENTWISE_FUNC_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/math_unary_elementwise_func.h b/oneflow/user/kernels/math_unary_elementwise_func.h index c55ecf4..aff50c3 100644 --- a/oneflow/user/kernels/math_unary_elementwise_func.h +++ b/oneflow/user/kernels/math_unary_elementwise_func.h @@ -1,983 +1,983 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ -#define ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ - -#include "oneflow/core/common/util.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/user/ops/math_unary_elementwise_seq.h" -#include "oneflow/core/device/cuda_pseudo_half.h" - -#if defined(__CUDACC__) - -#include -#define MATH_FUNC_F(name, x) name##f(x) -#define MATH_FUNC_D(name, x) name(x) - -#elif defined(__HIPCC__) -#include -#include - -#if defined(__HIP_DEVICE_COMPILE__) -#define MATH_FUNC_F(name, x) name##f(x) -#define MATH_FUNC_D(name, x) name(x) -#else -#define MATH_FUNC_F(name, x) std::name(x) -#define MATH_FUNC_D(name, x) std::name(x) -#endif - -#else - -#include -#define MATH_FUNC_F(name, x) std::name(x) -#define MATH_FUNC_D(name, x) std::name(x) - -#endif - -namespace oneflow { - -#define DECLARE_UNARY_FUNCTOR(math_unary_elementwise_type, func_prefix) \ - template \ - struct func_prefix##Functor; - -OF_PP_FOR_EACH_TUPLE(DECLARE_UNARY_FUNCTOR, MATH_UNARY_ELEMENTWISE_FUNC_SEQ) - -template -struct AbsFunctor { - static OF_DEVICE_FUNC T Forward(const T x) { - if (x == T(0)) - return T(0); - else - return x < T(0) ? -x : x; - } - - static OF_DEVICE_FUNC T Backward(const T x, const T dy) { - if (x == T(0)) - return T(0); - else - return x < T(0) ? -dy : dy; - } -}; - -template -struct SignFunctor { - static OF_DEVICE_FUNC T Forward(const T x) { return (T(0) < x) - (x < T(0)); } - - static OF_DEVICE_FUNC T Backward(const T x, const T dy) { return T(0); } -}; - -template<> -struct RsqrtFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { -#if defined(__CUDACC__) - return rsqrtf(x); -#elif defined(__HIP_DEVICE_COMPILE__) - return rsqrtf(x); -#else - return 1.0f / std::sqrt(x); -#endif - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (-1.0f / (2.0f * MATH_FUNC_F(sqrt, x * x * x))); - } -}; - -template<> -struct RsqrtFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { -#if defined(__CUDACC__) - return rsqrt(x); -#elif defined(__HIP_DEVICE_COMPILE__) - return rsqrt(x); -#else - return 1.0 / std::sqrt(x); -#endif - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (-1.0 / (2.0 * MATH_FUNC_D(sqrt, x * x * x))); - } -}; - -// float version - -template<> -struct AcosFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acos, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * -RsqrtFunctor::Forward(1.0f - x * x); - } -}; - -template<> -struct AcoshFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acosh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * RsqrtFunctor::Forward(x * x - 1.0f); - } -}; - -template<> -struct AsinFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asin, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * RsqrtFunctor::Forward(1.0f - x * x); - } -}; - -template<> -struct AsinhFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asinh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * RsqrtFunctor::Forward(1.0f + x * x); - } -}; - -template<> -struct AtanFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atan, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - 
return dy * (1.0f / (1.0f + x * x)); - } -}; - -template<> -struct AtanhFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atanh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (1.0f - x * x)); - } -}; - -template<> -struct NotEqualZeroFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return x != 0; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct CeilFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(ceil, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct CosFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cos, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (-MATH_FUNC_F(sin, x)); - } -}; - -template<> -struct CoshFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cosh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(sinh, x); - } -}; - -template<> -struct ErfFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erf, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * 2.0f * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ErfcFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erfc, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * -2.0f * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ExpFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(exp, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(exp, x); - } -}; - -template<> -struct Expm1Functor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(expm1, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(exp, x); - } -}; - -template<> -struct FloorFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(floor, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct LgammaFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(lgamma, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - // TODO(chengcheng): return: dy * digamma(x) - // assert(false); - return 0.0f; - } -}; - -template<> -struct LogFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * (1.0f / x); } -}; - -template<> -struct Log2Functor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log2, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (x * MATH_FUNC_F(log, 2.0f))); - } -}; - -template<> -struct Log1pFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log1p, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (x + 1.0f)); - } -}; - -template<> -struct LogSigmoidFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { - return 
-MATH_FUNC_F(log, (1.0f + MATH_FUNC_F(exp, -x))); - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (MATH_FUNC_F(exp, x) + 1.0f)); - } -}; - -template<> -struct NegativeFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return -x; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return -dy; } -}; - -template<> -struct ReciprocalFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return 1.0f / x; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (-1.0f / (x * x)); - } -}; - -template<> -struct ReciprocalNoNanFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { - if (fabsf(x) <= 0.0f) { return 0.0f; } - return 1.0f / x; - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - if (fabsf(x) <= 0.0f) { return 0.0f; } - return dy * (-1.0f / (x * x)); - } -}; - -template<> -struct RintFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(rint, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct RoundFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(nearbyint, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct SigmoidFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { - return 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - float y = 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); - return dy * (y * (1.0f - y)); - } -}; - -template<> -struct SinFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sin, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(cos, x); - } -}; - -template<> -struct SinhFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sinh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(cosh, x); - } -}; - -template<> -struct SqrtFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sqrt, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * 0.5f / MATH_FUNC_F(sqrt, x); - } -}; - -template<> -struct SquareFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return x * x; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * 2.0f * x; } -}; - -template<> -struct TanFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(tan, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (MATH_FUNC_F(cos, x) * MATH_FUNC_F(cos, x))); - } -}; - -// double version - -template<> -struct AcosFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acos, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * -RsqrtFunctor::Forward(1.0 - x * x); - } -}; - -template<> -struct AcoshFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acosh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * -RsqrtFunctor::Forward(x * x - 1.0); - } -}; - -template<> -struct AsinFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asin, 
x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * RsqrtFunctor::Forward(1.0 - x * x); - } -}; - -template<> -struct AsinhFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asinh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * RsqrtFunctor::Forward(1.0 + x * x); - } -}; - -template<> -struct AtanFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atan, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (1.0 + x * x)); - } -}; - -template<> -struct AtanhFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atanh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (1.0 - x * x)); - } -}; - -template<> -struct NotEqualZeroFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return x != 0; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0f; } -}; - -template<> -struct CeilFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(ceil, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct CosFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cos, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (-MATH_FUNC_D(sin, x)); - } -}; - -template<> -struct CoshFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cosh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(sinh, x); - } -}; - -template<> -struct ErfFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erf, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * 2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ErfcFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erfc, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * -2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ExpFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(exp, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(exp, x); - } -}; - -template<> -struct Expm1Functor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(expm1, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(exp, x); - } -}; - -template<> -struct FloorFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(floor, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct LgammaFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(lgamma, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - // TODO(chengcheng): return: dy * digamma(x) - // assert(false); - return 0.0; - } -}; - -template<> -struct LogFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const 
double dy) { return dy * (1.0 / x); } -}; - -template<> -struct Log2Functor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log2, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (x * MATH_FUNC_D(log, 2.0))); - } -}; - -template<> -struct Log1pFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log1p, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (x + 1.0)); - } -}; - -template<> -struct LogSigmoidFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { - return -MATH_FUNC_D(log, (1.0 + MATH_FUNC_D(exp, -x))); - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (MATH_FUNC_D(exp, x) + 1.0)); - } -}; - -template<> -struct NegativeFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return -x; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return -dy; } -}; - -template<> -struct ReciprocalFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return 1.0 / x; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (-1.0 / (x * x)); - } -}; - -template<> -struct ReciprocalNoNanFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { - if (fabs(x) <= 0.0) { return 0.0; } - return 1.0 / x; - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - if (fabs(x) <= 0.0) { return 0.0; } - return dy * (-1.0 / (x * x)); - } -}; - -template<> -struct RintFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(rint, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct RoundFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(nearbyint, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct SigmoidFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { - return 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - double y = 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); - return dy * (y * (1.0 - y)); - } -}; - -template<> -struct SinFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sin, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(cos, x); - } -}; - -template<> -struct SinhFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sinh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(cosh, x); - } -}; - -template<> -struct SqrtFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sqrt, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (double)0.5 / MATH_FUNC_D(sqrt, x); - } -}; - -template<> -struct SquareFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return x * x; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return dy * 2.0 * x; } -}; - -template<> -struct TanFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(tan, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (MATH_FUNC_D(cos, 
x) * MATH_FUNC_D(cos, x))); - } -}; - -#if defined(__CUDACC__) || defined(__HIPCC__) -// half version - -#define OF_HALF_FUNC __device__ __forceinline__ - -#define MATH_FUNC_H(name, x) __float2half(name##f(__half2float(x))) -#define HALF_VAL_HALF __float2half(0.5f) -#define HALF_VAL_TWO __float2half(2.0f) -#define HALF_VAL_2RSQRT_PI __float2half(1.1283791671f) - -template<> -struct AbsFunctor { - static OF_HALF_FUNC half Forward(const half x) { - return __hlt(x, GetZeroVal()) ? __hneg(x) : x; - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hlt(x, GetZeroVal()) ? __hneg(dy) : dy; - } -}; - -template<> -struct AcosFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acos, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hrsqrt(__hsub(GetOneVal(), __hmul(x, x))))); - } -}; - -template<> -struct AcoshFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acosh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrsqrt(__hsub(__hmul(x, x), GetOneVal()))); - } -}; - -template<> -struct AsinFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asin, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrsqrt(__hsub(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct AsinhFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asinh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrsqrt(__hadd(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct AtanFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atan, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hdiv(GetOneVal(), __hadd(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct AtanhFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atanh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hdiv(GetOneVal(), __hsub(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct CeilFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hceil(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct NotEqualZeroFunctor { - static OF_HALF_FUNC half Forward(const half x) { return x != static_cast(0.0); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct CosFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hcos(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hsin(x))); - } -}; - -template<> -struct CoshFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(cosh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, MATH_FUNC_H(sinh, x)); - } -}; - -template<> -struct ErfFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erf, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x)))); - } -}; - -template<> -struct ErfcFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erfc, x); } - - static OF_HALF_FUNC half Backward(const half x, const 
half dy) { - return __hmul(dy, __hneg(__hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x))))); - } -}; - -template<> -struct ExpFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hexp(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } -}; - -template<> -struct Expm1Functor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(expm1, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } -}; - -template<> -struct FloorFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hfloor(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct LgammaFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(lgamma, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - // TODO(chengcheng): return: dy * digamma(x) - // assert(false); - return GetZeroVal(); - } -}; - -template<> -struct LogFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hlog(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hrcp(x)); } -}; - -template<> -struct Log2Functor { - static OF_HALF_FUNC half Forward(const half x) { return hlog2(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hmul(x, hlog(HALF_VAL_TWO)))); - } -}; - -template<> -struct Log1pFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(log1p, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hadd(x, GetOneVal()))); - } -}; - -template<> -struct LogSigmoidFunctor { - static OF_HALF_FUNC half Forward(const half x) { - return __hneg(hlog(__hadd(GetOneVal(), hexp(__hneg(x))))); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hadd(hexp(x), GetOneVal()))); - } -}; - -template<> -struct NegativeFunctor { - static OF_HALF_FUNC half Forward(const half x) { return __hneg(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hneg(dy); } -}; - -template<> -struct ReciprocalFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hrcp(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); - } -}; - -template<> -struct ReciprocalNoNanFunctor { - static OF_HALF_FUNC half Forward(const half x) { - if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } - return hrcp(x); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } - return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); - } -}; - -template<> -struct RintFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hrint(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct RoundFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(nearbyint, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct RsqrtFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hrsqrt(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hrcp(__hmul(HALF_VAL_TWO, hsqrt(__hmul(x, __hmul(x, x))))))); - } -}; - -template<> -struct SigmoidFunctor { - 
static OF_HALF_FUNC half Forward(const half x) { - return hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - half y = hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); - return __hmul(dy, __hmul(y, __hsub(GetOneVal(), y))); - } -}; - -template<> -struct SignFunctor { - static OF_HALF_FUNC half Forward(const half x) { - if (__hgt(x, GetZeroVal())) { return GetOneVal(); } - if (__hlt(x, GetZeroVal())) { return __hneg(GetOneVal()); } - return GetZeroVal(); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct SinFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hsin(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hcos(x)); } -}; - -template<> -struct SinhFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(sinh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, MATH_FUNC_H(cosh, x)); - } -}; - -template<> -struct SqrtFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hsqrt(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hdiv(HALF_VAL_HALF, hsqrt(x))); - } -}; - -template<> -struct SquareFunctor { - static OF_HALF_FUNC half Forward(const half x) { return __hmul(x, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hmul(HALF_VAL_TWO, x)); - } -}; - -template<> -struct TanFunctor { - static OF_HALF_FUNC half Forward(const half x) { return __hdiv(hsin(x), hcos(x)); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hmul(hcos(x), hcos(x)))); - } -}; - -#endif - -} // namespace oneflow - -#endif // ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#ifndef ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_
+#define ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_
+
+#include "oneflow/core/common/util.h"
+#include "oneflow/core/common/data_type.h"
+#include "oneflow/user/ops/math_unary_elementwise_seq.h"
+#include "oneflow/core/device/cuda_pseudo_half.h"
+
+#if defined(__CUDACC__)
+
+#include <cuda_fp16.h>
+#define MATH_FUNC_F(name, x) name##f(x)
+#define MATH_FUNC_D(name, x) name(x)
+
+#elif defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+#if defined(__HIP_DEVICE_COMPILE__)
+#define MATH_FUNC_F(name, x) name##f(x)
+#define MATH_FUNC_D(name, x) name(x)
+#else
+#define MATH_FUNC_F(name, x) std::name(x)
+#define MATH_FUNC_D(name, x) std::name(x)
+#endif
+
+#else
+
+#include <cmath>
+#define MATH_FUNC_F(name, x) std::name(x)
+#define MATH_FUNC_D(name, x) std::name(x)
+
+#endif
+
+namespace oneflow {
+
+#define DECLARE_UNARY_FUNCTOR(math_unary_elementwise_type, func_prefix) \
+  template<typename T>                                                  \
+  struct func_prefix##Functor;
+
+OF_PP_FOR_EACH_TUPLE(DECLARE_UNARY_FUNCTOR, MATH_UNARY_ELEMENTWISE_FUNC_SEQ)
+
+template<typename T>
+struct AbsFunctor {
+  static OF_DEVICE_FUNC T Forward(const T x) {
+    if (x == T(0))
+      return T(0);
+    else
+      return x < T(0) ? -x : x;
+  }
+
+  static OF_DEVICE_FUNC T Backward(const T x, const T dy) {
+    if (x == T(0))
+      return T(0);
+    else
+      return x < T(0) ? -dy : dy;
+  }
+};
+
+template<typename T>
+struct SignFunctor {
+  static OF_DEVICE_FUNC T Forward(const T x) { return (T(0) < x) - (x < T(0)); }
+
+  static OF_DEVICE_FUNC T Backward(const T x, const T dy) { return T(0); }
+};
+
+template<>
+struct RsqrtFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) {
+#if defined(__CUDACC__)
+    return rsqrtf(x);
+#elif defined(__HIP_DEVICE_COMPILE__)
+    return rsqrtf(x);
+#else
+    return 1.0f / std::sqrt(x);
+#endif
+  }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (-1.0f / (2.0f * MATH_FUNC_F(sqrt, x * x * x)));
+  }
+};
+
+template<>
+struct RsqrtFunctor<double> {
+  static OF_DEVICE_FUNC double Forward(const double x) {
+#if defined(__CUDACC__)
+    return rsqrt(x);
+#elif defined(__HIP_DEVICE_COMPILE__)
+    return rsqrt(x);
+#else
+    return 1.0 / std::sqrt(x);
+#endif
+  }
+
+  static OF_DEVICE_FUNC double Backward(const double x, const double dy) {
+    return dy * (-1.0 / (2.0 * MATH_FUNC_D(sqrt, x * x * x)));
+  }
+};
+
+// float version
+
+template<>
+struct AcosFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acos, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * -RsqrtFunctor<float>::Forward(1.0f - x * x);
+  }
+};
+
+template<>
+struct AcoshFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acosh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * RsqrtFunctor<float>::Forward(x * x - 1.0f);
+  }
+};
+
+template<>
+struct AsinFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asin, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * RsqrtFunctor<float>::Forward(1.0f - x * x);
+  }
+};
+
+template<>
+struct AsinhFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asinh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * RsqrtFunctor<float>::Forward(1.0f + x * x);
+  }
+};
+
+template<>
+struct AtanFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atan, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (1.0f + x * x));
+  }
+};
+
+template<>
+struct AtanhFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atanh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (1.0f - x * x));
+  }
+};
+
+template<>
+struct NotEqualZeroFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return x != 0; }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; }
+};
+
+template<>
+struct CeilFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(ceil, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; }
+};
+
+template<>
+struct CosFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cos, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (-MATH_FUNC_F(sin, x));
+  }
+};
+
+template<>
+struct CoshFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cosh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * MATH_FUNC_F(sinh, x);
+  }
+};
+
+template<>
+struct ErfFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erf, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * 2.0f * RsqrtFunctor<float>::Forward(M_PI) * expf(-x * x);
+  }
+};
+
+template<>
+struct ErfcFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erfc, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * -2.0f * RsqrtFunctor<float>::Forward(M_PI) * expf(-x * x);
+  }
+};
+
+template<>
+struct ExpFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(exp, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * MATH_FUNC_F(exp, x);
+  }
+};
+
+template<>
+struct Expm1Functor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(expm1, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * MATH_FUNC_F(exp, x);
+  }
+};
+
+template<>
+struct FloorFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(floor, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; }
+};
+
+template<>
+struct LgammaFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(lgamma, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    // TODO(chengcheng): return: dy * digamma(x)
+    // assert(false);
+    return 0.0f;
+  }
+};
+
+template<>
+struct LogFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * (1.0f / x); }
+};
+
+template<>
+struct Log2Functor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log2, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (x * MATH_FUNC_F(log, 2.0f)));
+  }
+};
+
+template<>
+struct Log1pFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log1p, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (x + 1.0f));
+  }
+};
+
+template<>
+struct LogSigmoidFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) {
+    return
-MATH_FUNC_F(log, (1.0f + MATH_FUNC_F(exp, -x))); + } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * (1.0f / (MATH_FUNC_F(exp, x) + 1.0f)); + } +}; + +template<> +struct NegativeFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return -x; } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return -dy; } +}; + +template<> +struct ReciprocalFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return 1.0f / x; } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * (-1.0f / (x * x)); + } +}; + +template<> +struct ReciprocalNoNanFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { + if (fabsf(x) <= 0.0f) { return 0.0f; } + return 1.0f / x; + } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + if (fabsf(x) <= 0.0f) { return 0.0f; } + return dy * (-1.0f / (x * x)); + } +}; + +template<> +struct RintFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(rint, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } +}; + +template<> +struct RoundFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(nearbyint, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } +}; + +template<> +struct SigmoidFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { + return 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); + } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + float y = 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); + return dy * (y * (1.0f - y)); + } +}; + +template<> +struct SinFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sin, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * MATH_FUNC_F(cos, x); + } +}; + +template<> +struct SinhFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sinh, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * MATH_FUNC_F(cosh, x); + } +}; + +template<> +struct SqrtFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sqrt, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * 0.5f / MATH_FUNC_F(sqrt, x); + } +}; + +template<> +struct SquareFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return x * x; } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * 2.0f * x; } +}; + +template<> +struct TanFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(tan, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * (1.0f / (MATH_FUNC_F(cos, x) * MATH_FUNC_F(cos, x))); + } +}; + +// double version + +template<> +struct AcosFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acos, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * -RsqrtFunctor::Forward(1.0 - x * x); + } +}; + +template<> +struct AcoshFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acosh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * -RsqrtFunctor::Forward(x * x - 1.0); + } +}; + +template<> +struct AsinFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asin, 
x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * RsqrtFunctor::Forward(1.0 - x * x); + } +}; + +template<> +struct AsinhFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asinh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * RsqrtFunctor::Forward(1.0 + x * x); + } +}; + +template<> +struct AtanFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atan, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (1.0 + x * x)); + } +}; + +template<> +struct AtanhFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atanh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (1.0 - x * x)); + } +}; + +template<> +struct NotEqualZeroFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return x != 0; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0f; } +}; + +template<> +struct CeilFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(ceil, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct CosFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cos, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (-MATH_FUNC_D(sin, x)); + } +}; + +template<> +struct CoshFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cosh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(sinh, x); + } +}; + +template<> +struct ErfFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erf, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * 2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); + } +}; + +template<> +struct ErfcFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erfc, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * -2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); + } +}; + +template<> +struct ExpFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(exp, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(exp, x); + } +}; + +template<> +struct Expm1Functor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(expm1, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(exp, x); + } +}; + +template<> +struct FloorFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(floor, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct LgammaFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(lgamma, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + // TODO(chengcheng): return: dy * digamma(x) + // assert(false); + return 0.0; + } +}; + +template<> +struct LogFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const 
double dy) { return dy * (1.0 / x); } +}; + +template<> +struct Log2Functor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log2, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (x * MATH_FUNC_D(log, 2.0))); + } +}; + +template<> +struct Log1pFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log1p, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (x + 1.0)); + } +}; + +template<> +struct LogSigmoidFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { + return -MATH_FUNC_D(log, (1.0 + MATH_FUNC_D(exp, -x))); + } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (MATH_FUNC_D(exp, x) + 1.0)); + } +}; + +template<> +struct NegativeFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return -x; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return -dy; } +}; + +template<> +struct ReciprocalFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return 1.0 / x; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (-1.0 / (x * x)); + } +}; + +template<> +struct ReciprocalNoNanFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { + if (fabs(x) <= 0.0) { return 0.0; } + return 1.0 / x; + } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + if (fabs(x) <= 0.0) { return 0.0; } + return dy * (-1.0 / (x * x)); + } +}; + +template<> +struct RintFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(rint, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct RoundFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(nearbyint, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct SigmoidFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { + return 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); + } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + double y = 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); + return dy * (y * (1.0 - y)); + } +}; + +template<> +struct SinFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sin, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(cos, x); + } +}; + +template<> +struct SinhFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sinh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(cosh, x); + } +}; + +template<> +struct SqrtFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sqrt, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (double)0.5 / MATH_FUNC_D(sqrt, x); + } +}; + +template<> +struct SquareFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return x * x; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return dy * 2.0 * x; } +}; + +template<> +struct TanFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(tan, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (MATH_FUNC_D(cos, 
x) * MATH_FUNC_D(cos, x))); + } +}; + +#if defined(__CUDACC__) || defined(__HIPCC__) +// half version + +#define OF_HALF_FUNC __device__ __forceinline__ + +#define MATH_FUNC_H(name, x) __float2half(name##f(__half2float(x))) +#define HALF_VAL_HALF __float2half(0.5f) +#define HALF_VAL_TWO __float2half(2.0f) +#define HALF_VAL_2RSQRT_PI __float2half(1.1283791671f) + +template<> +struct AbsFunctor { + static OF_HALF_FUNC half Forward(const half x) { + return __hlt(x, GetZeroVal()) ? __hneg(x) : x; + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hlt(x, GetZeroVal()) ? __hneg(dy) : dy; + } +}; + +template<> +struct AcosFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acos, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hrsqrt(__hsub(GetOneVal(), __hmul(x, x))))); + } +}; + +template<> +struct AcoshFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acosh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrsqrt(__hsub(__hmul(x, x), GetOneVal()))); + } +}; + +template<> +struct AsinFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asin, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrsqrt(__hsub(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct AsinhFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asinh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrsqrt(__hadd(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct AtanFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atan, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hdiv(GetOneVal(), __hadd(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct AtanhFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atanh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hdiv(GetOneVal(), __hsub(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct CeilFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hceil(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct NotEqualZeroFunctor { + static OF_HALF_FUNC half Forward(const half x) { return x != static_cast(0.0); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct CosFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hcos(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hsin(x))); + } +}; + +template<> +struct CoshFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(cosh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, MATH_FUNC_H(sinh, x)); + } +}; + +template<> +struct ErfFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erf, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x)))); + } +}; + +template<> +struct ErfcFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erfc, x); } + + static OF_HALF_FUNC half Backward(const half x, const 
half dy) { + return __hmul(dy, __hneg(__hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x))))); + } +}; + +template<> +struct ExpFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hexp(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } +}; + +template<> +struct Expm1Functor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(expm1, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } +}; + +template<> +struct FloorFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hfloor(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct LgammaFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(lgamma, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + // TODO(chengcheng): return: dy * digamma(x) + // assert(false); + return GetZeroVal(); + } +}; + +template<> +struct LogFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hlog(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hrcp(x)); } +}; + +template<> +struct Log2Functor { + static OF_HALF_FUNC half Forward(const half x) { return hlog2(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hmul(x, hlog(HALF_VAL_TWO)))); + } +}; + +template<> +struct Log1pFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(log1p, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hadd(x, GetOneVal()))); + } +}; + +template<> +struct LogSigmoidFunctor { + static OF_HALF_FUNC half Forward(const half x) { + return __hneg(hlog(__hadd(GetOneVal(), hexp(__hneg(x))))); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hadd(hexp(x), GetOneVal()))); + } +}; + +template<> +struct NegativeFunctor { + static OF_HALF_FUNC half Forward(const half x) { return __hneg(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hneg(dy); } +}; + +template<> +struct ReciprocalFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hrcp(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); + } +}; + +template<> +struct ReciprocalNoNanFunctor { + static OF_HALF_FUNC half Forward(const half x) { + if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } + return hrcp(x); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } + return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); + } +}; + +template<> +struct RintFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hrint(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct RoundFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(nearbyint, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct RsqrtFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hrsqrt(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hrcp(__hmul(HALF_VAL_TWO, hsqrt(__hmul(x, __hmul(x, x))))))); + } +}; + +template<> +struct SigmoidFunctor { + 
static OF_HALF_FUNC half Forward(const half x) { + return hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + half y = hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); + return __hmul(dy, __hmul(y, __hsub(GetOneVal(), y))); + } +}; + +template<> +struct SignFunctor { + static OF_HALF_FUNC half Forward(const half x) { + if (__hgt(x, GetZeroVal())) { return GetOneVal(); } + if (__hlt(x, GetZeroVal())) { return __hneg(GetOneVal()); } + return GetZeroVal(); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct SinFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hsin(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hcos(x)); } +}; + +template<> +struct SinhFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(sinh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, MATH_FUNC_H(cosh, x)); + } +}; + +template<> +struct SqrtFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hsqrt(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hdiv(HALF_VAL_HALF, hsqrt(x))); + } +}; + +template<> +struct SquareFunctor { + static OF_HALF_FUNC half Forward(const half x) { return __hmul(x, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hmul(HALF_VAL_TWO, x)); + } +}; + +template<> +struct TanFunctor { + static OF_HALF_FUNC half Forward(const half x) { return __hdiv(hsin(x), hcos(x)); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hmul(hcos(x), hcos(x)))); + } +}; + +#endif + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ diff --git a/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp b/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp index 160dd50..1c276ab 100644 --- a/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp +++ b/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp @@ -1,177 +1,177 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/math_unary_elementwise_func.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template class UnaryFunctor, typename T> -__global__ void MathUnaryElementwiseForwardGpu(const int64_t n, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { y[i] = UnaryFunctor::Forward(x[i]); } -} - -template class UnaryFunctor, typename T> -__global__ void MathUnaryElementwiseBackwardGpu(const int64_t n, const T* x, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { dx[i] = UnaryFunctor::Backward(x[i], dy[i]); } -} - -} // namespace - -template class UnaryFunctor, typename T> -class MathUnaryElementwiseGpuKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGpuKernel() = default; - ~MathUnaryElementwiseGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const T* x = tensor_x->dptr(); - T* y = tensor_y->mut_dptr(); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, y); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class UnaryFunctor, typename T> -class MathUnaryElementwiseGradGpuKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGradGpuKernel() = default; - ~MathUnaryElementwiseGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const T* x = tensor_x->dptr(); - const T* dy = tensor_dy->dptr(); - T* dx = tensor_dx->mut_dptr(); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseBackwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ - REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ - .SetCreateFn< \ - MathUnaryElementwiseGpuKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)) \ - && (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ - \ - REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_grad")) \ - .SetCreateFn< \ - MathUnaryElementwiseGradGpuKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - MATH_UNARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) - -// For some special dtype kernel register. 
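// The kernels in this file reduce every unary op to the shared functor contract from
// math_unary_elementwise_func.h: the forward kernel computes y[i] = UnaryFunctor<T>::Forward(x[i])
// and the grad kernel computes dx[i] = UnaryFunctor<T>::Backward(x[i], dy[i]). A minimal
// host-side sketch of that contract (illustrative only; it assumes OF_DEVICE_FUNC permits host
// calls, which the std:: fallback of MATH_FUNC_F suggests):
//
//   const float x = 0.5f, dy = 1.0f;
//   const float y = oneflow::SigmoidFunctor<float>::Forward(x);        // 1 / (1 + exp(-x))
//   const float dx = oneflow::SigmoidFunctor<float>::Backward(x, dy);  // dy * y * (1 - y)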
-OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - OF_PP_MAKE_TUPLE_SEQ("abs", Abs), UNSIGNED_INT_DATA_TYPE_SEQ) -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - OF_PP_MAKE_TUPLE_SEQ("abs", Abs), INT_DATA_TYPE_SEQ) - -template class UnaryFunctor> -class MathUnaryElementwiseGpuHalfKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGpuHalfKernel() = default; - ~MathUnaryElementwiseGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const half* x = reinterpret_cast(tensor_x->dptr()); - half* y = reinterpret_cast(tensor_y->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, y); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class UnaryFunctor> -class MathUnaryElementwiseGradGpuHalfKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGradGpuHalfKernel() = default; - ~MathUnaryElementwiseGradGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* dy = reinterpret_cast(tensor_dy->dptr()); - half* dx = reinterpret_cast(tensor_dx->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseBackwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD(math_type_str, math_func_prefix) \ - REGISTER_USER_KERNEL(math_type_str) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16) \ - && (user_op::HobDataType("y", 0) == DataType::kFloat16)); \ - \ - REGISTER_USER_KERNEL((std::string("") + math_type_str + "_grad")) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); - -// OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD, -// MATH_UNARY_ELEMENTWISE_FUNC_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/math_unary_elementwise_func.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template class UnaryFunctor, typename T> +__global__ void MathUnaryElementwiseForwardGpu(const int64_t n, const T* x, T* y) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { y[i] = UnaryFunctor::Forward(x[i]); } +} + +template class UnaryFunctor, typename T> +__global__ void MathUnaryElementwiseBackwardGpu(const int64_t n, const T* x, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { dx[i] = UnaryFunctor::Backward(x[i], dy[i]); } +} + +} // namespace + +template class UnaryFunctor, typename T> +class MathUnaryElementwiseGpuKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGpuKernel() = default; + ~MathUnaryElementwiseGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const T* x = tensor_x->dptr(); + T* y = tensor_y->mut_dptr(); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, y); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class UnaryFunctor, typename T> +class MathUnaryElementwiseGradGpuKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGradGpuKernel() = default; + ~MathUnaryElementwiseGradGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const T* x = tensor_x->dptr(); + const T* dy = tensor_dy->dptr(); + T* dx = tensor_dx->mut_dptr(); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseBackwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ + REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ + .SetCreateFn< \ + MathUnaryElementwiseGpuKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)) \ + && (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ + \ + REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_grad")) \ + .SetCreateFn< \ + MathUnaryElementwiseGradGpuKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + MATH_UNARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) + +// For some special dtype kernel register. 
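// For reference, REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD above expands roughly as
// follows for the ("sqrt", Sqrt) x (float, DataType::kFloat) pair; this is an illustrative
// sketch, not verbatim preprocessor output. The same macro is reused just below for the integer
// specializations of "abs".
//
//   REGISTER_USER_KERNEL("sqrt")
//       .SetCreateFn<MathUnaryElementwiseGpuKernel<SqrtFunctor, float>>()
//       .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
//                        && (user_op::HobDataType("x", 0) == DataType::kFloat)
//                        && (user_op::HobDataType("y", 0) == DataType::kFloat));
//
//   REGISTER_USER_KERNEL("sqrt_grad")
//       .SetCreateFn<MathUnaryElementwiseGradGpuKernel<SqrtFunctor, float>>()
//       .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
//                        && (user_op::HobDataType("x", 0) == DataType::kFloat));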
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + OF_PP_MAKE_TUPLE_SEQ("abs", Abs), UNSIGNED_INT_DATA_TYPE_SEQ) +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + OF_PP_MAKE_TUPLE_SEQ("abs", Abs), INT_DATA_TYPE_SEQ) + +template class UnaryFunctor> +class MathUnaryElementwiseGpuHalfKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGpuHalfKernel() = default; + ~MathUnaryElementwiseGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const half* x = reinterpret_cast(tensor_x->dptr()); + half* y = reinterpret_cast(tensor_y->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, y); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class UnaryFunctor> +class MathUnaryElementwiseGradGpuHalfKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGradGpuHalfKernel() = default; + ~MathUnaryElementwiseGradGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* dy = reinterpret_cast(tensor_dy->dptr()); + half* dx = reinterpret_cast(tensor_dx->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseBackwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD(math_type_str, math_func_prefix) \ + REGISTER_USER_KERNEL(math_type_str) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16) \ + && (user_op::HobDataType("y", 0) == DataType::kFloat16)); \ + \ + REGISTER_USER_KERNEL((std::string("") + math_type_str + "_grad")) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); + +// OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD, +// MATH_UNARY_ELEMENTWISE_FUNC_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/max_pool_kernel.hip.cpp b/oneflow/user/kernels/max_pool_kernel.hip.cpp index cd3e86f..9ac75d0 100644 --- a/oneflow/user/kernels/max_pool_kernel.hip.cpp +++ b/oneflow/user/kernels/max_pool_kernel.hip.cpp @@ -1,289 +1,289 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/max_pool_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetMinThreadNum(int64_t elem_num) { return std::min(elem_num, kBlockSize); } - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -template -__device__ __inline__ void Maxpool2dForwardComputeCLast( - const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, - const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, - const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, - const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, - const int32_t dilation_w) { - IDX n, h, w, c; - CUDA_1D_KERNEL_LOOP(num, elem_num) { - index_helper.OffsetToNdIndex(num, n, h, w, c); - - const IDX x_start_idx = n * n_channel * x_width * x_height; - const IDX y_start_idx = n * n_channel * y_height * y_width; - IDX hstart = h * stride_h - padding_h; - IDX wstart = w * stride_w - padding_w; - const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height - ? (hstart + (kernel_size_h - 1) * dilation_h + 1) - : x_height; - const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width - ? 
(wstart + (kernel_size_w - 1) * dilation_w + 1) - : x_width; - - while (hstart < 0) { hstart += dilation_h; } - while (wstart < 0) { wstart += dilation_w; } - /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - IDX max_index = hstart * x_width + wstart; - IDX src_idx = 0; - /* equal to -std::numeric_limits::infinity(); */ - T max_value = detail::numeric_limits::lower_bound(); - - for (IDX i = hstart; i < hend; i++) { - for (IDX j = wstart; j < wend; j++) { - const IDX window_idx = i * x_width * n_channel + j * n_channel + c; - const IDX search_idx = x_start_idx + window_idx; - T val = src[search_idx]; - if (val > max_value || detail::numerics::isnan(val)) { - max_value = val; - max_index = window_idx; - src_idx = search_idx; - } - } - } - const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; - dest[out_idx] = src[src_idx]; - indice_ptr[out_idx] = max_index; - } -} - -} // namespace - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, - int32_t n_batch, int32_t n_channel, int32_t x_length, - int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { - Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, - n_channel, x_length, kernel_size_l, stride_l, dilation_l); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, - int32_t padding_w, int32_t n_batch, int32_t n_channel, - int32_t x_height, int32_t x_width, int32_t kernel_size_h, - int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, - int32_t dilation_h, int32_t dilation_w) { - Maxpool2dForwardComputeCFirst( - index_helper, elem_num, src, dest, indice_ptr, padding_h, padding_w, n_batch, n_channel, - x_height, x_width, kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, - int32_t padding_w, int32_t n_batch, int32_t n_channel, - int32_t x_height, int32_t x_width, int32_t y_height, - int32_t y_width, int32_t kernel_size_h, int32_t kernel_size_w, - int32_t stride_h, int32_t stride_w, int32_t dilation_h, - int32_t dilation_w) { - Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, - padding_w, n_batch, n_channel, x_height, x_width, y_height, - y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, - dilation_h, dilation_w); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, - int32_t padding_h, int32_t padding_w, int32_t n_batch, - int32_t n_channel, int32_t x_time, int32_t x_height, - int32_t x_width, int32_t kernel_size_t, int32_t kernel_size_h, - int32_t kernel_size_w, int32_t stride_t, int32_t stride_h, - int32_t stride_w, int32_t dilation_t, int32_t dilation_h, - int32_t dilation_w) { - Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, - padding_h, padding_w, n_batch, n_channel, x_time, x_height, - x_width, kernel_size_t, kernel_size_h, kernel_size_w, stride_t, - 
stride_h, stride_w, dilation_t, dilation_h, dilation_w); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, - const T* src, T* dest, const int64_t* indice_ptr, - const int32_t n_batch, const int32_t n_channel, - const int32_t src_length, const int32_t dst_length) { - Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_length, dst_length); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, - const int32_t n_channel, const int32_t src_height, - const int32_t src_width, const int32_t dst_height, - const int32_t dst_width) { - Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_height, src_width, dst_height, dst_width); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, - const int32_t n_channel, const int32_t src_height, - const int32_t src_width, const int32_t dst_height, - const int32_t dst_width) { - Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_height, src_width, dst_height, dst_width); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, - const T* src, T* dest, const int64_t* indice_ptr, - const int32_t n_batch, const int32_t n_channel, - const int32_t src_time, const int32_t src_height, - const int32_t src_width, const int32_t dst_time, - const int32_t dst_height, const int32_t dst_width) { - Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_time, src_height, src_width, dst_time, dst_height, - dst_width); -}; - -template -struct PoolKernelUtil { - static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool1dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], - params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), - params_3d.pool_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); - } - - static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool1dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); - } - - static void Maxpool2dForwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dForwardCFirst<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], - 
params_3d.pool_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); - } - - static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dBackwardCFirst<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); - } - - static void Maxpool2dForwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dForwardCLast<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[1], - params_3d.dilation_3d()[2]); - } - - static void Maxpool2dBackwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dBackwardCLast<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); - } - - static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool3dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], - params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], - params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1], - params_3d.stride_3d()[2], params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], - params_3d.dilation_3d()[2]); - } - - static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool3dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4)); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOL_KERNEL_UTIL, (DeviceType::kCUDA), - POOL_DATA_TYPE_CUDA_SEQ, POOL_IDX_DATA_TYPE_SEQ); - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
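/*
 * Illustration: the max-pool forward kernels in this file compute a dilated
 * window [hstart, hend) x [wstart, wend), clamp the right edge to the input
 * extent, step a negative start forward by whole dilation strides, and take a
 * NaN-propagating maximum (the explicit isnan() check lets a NaN win even
 * though NaN comparisons are false). A minimal 1-D sketch of that window
 * arithmetic; WindowBounds is a hypothetical helper, not part of this file.
 */
#include <cstdint>

struct Window {
  int64_t start;
  int64_t end;
};

inline Window WindowBounds(int64_t out_pos, int64_t stride, int64_t padding,
                           int64_t kernel_size, int64_t dilation, int64_t input_size) {
  int64_t start = out_pos * stride - padding;
  // Right edge of the dilated kernel, clamped to the input extent.
  int64_t end = start + (kernel_size - 1) * dilation + 1;
  if (end > input_size) { end = input_size; }
  // A negative start is advanced in dilation-sized steps, as the kernels do.
  while (start < 0) { start += dilation; }
  return {start, end};
}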
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/max_pool_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetMinThreadNum(int64_t elem_num) { return std::min(elem_num, kBlockSize); } + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +template +__device__ __inline__ void Maxpool2dForwardComputeCLast( + const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, + int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, + const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, + const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, + const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, + const int32_t dilation_w) { + IDX n, h, w, c; + CUDA_1D_KERNEL_LOOP(num, elem_num) { + index_helper.OffsetToNdIndex(num, n, h, w, c); + + const IDX x_start_idx = n * n_channel * x_width * x_height; + const IDX y_start_idx = n * n_channel * y_height * y_width; + IDX hstart = h * stride_h - padding_h; + IDX wstart = w * stride_w - padding_w; + const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + ? (hstart + (kernel_size_h - 1) * dilation_h + 1) + : x_height; + const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + ? 
(wstart + (kernel_size_w - 1) * dilation_w + 1) + : x_width; + + while (hstart < 0) { hstart += dilation_h; } + while (wstart < 0) { wstart += dilation_w; } + /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ + IDX max_index = hstart * x_width + wstart; + IDX src_idx = 0; + /* equal to -std::numeric_limits::infinity(); */ + T max_value = detail::numeric_limits::lower_bound(); + + for (IDX i = hstart; i < hend; i++) { + for (IDX j = wstart; j < wend; j++) { + const IDX window_idx = i * x_width * n_channel + j * n_channel + c; + const IDX search_idx = x_start_idx + window_idx; + T val = src[search_idx]; + if (val > max_value || detail::numerics::isnan(val)) { + max_value = val; + max_index = window_idx; + src_idx = search_idx; + } + } + } + const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; + dest[out_idx] = src[src_idx]; + indice_ptr[out_idx] = max_index; + } +} + +} // namespace + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, + int32_t n_batch, int32_t n_channel, int32_t x_length, + int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { + Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, + n_channel, x_length, kernel_size_l, stride_l, dilation_l); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, + int32_t padding_w, int32_t n_batch, int32_t n_channel, + int32_t x_height, int32_t x_width, int32_t kernel_size_h, + int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, + int32_t dilation_h, int32_t dilation_w) { + Maxpool2dForwardComputeCFirst( + index_helper, elem_num, src, dest, indice_ptr, padding_h, padding_w, n_batch, n_channel, + x_height, x_width, kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, + int32_t padding_w, int32_t n_batch, int32_t n_channel, + int32_t x_height, int32_t x_width, int32_t y_height, + int32_t y_width, int32_t kernel_size_h, int32_t kernel_size_w, + int32_t stride_h, int32_t stride_w, int32_t dilation_h, + int32_t dilation_w) { + Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, + padding_w, n_batch, n_channel, x_height, x_width, y_height, + y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, + dilation_h, dilation_w); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, + int32_t padding_h, int32_t padding_w, int32_t n_batch, + int32_t n_channel, int32_t x_time, int32_t x_height, + int32_t x_width, int32_t kernel_size_t, int32_t kernel_size_h, + int32_t kernel_size_w, int32_t stride_t, int32_t stride_h, + int32_t stride_w, int32_t dilation_t, int32_t dilation_h, + int32_t dilation_w) { + Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, + padding_h, padding_w, n_batch, n_channel, x_time, x_height, + x_width, kernel_size_t, kernel_size_h, kernel_size_w, stride_t, + 
stride_h, stride_w, dilation_t, dilation_h, dilation_w); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, + const T* src, T* dest, const int64_t* indice_ptr, + const int32_t n_batch, const int32_t n_channel, + const int32_t src_length, const int32_t dst_length) { + Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_length, dst_length); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_height, + const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { + Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_height, src_width, dst_height, dst_width); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_height, + const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { + Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_height, src_width, dst_height, dst_width); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, + const T* src, T* dest, const int64_t* indice_ptr, + const int32_t n_batch, const int32_t n_channel, + const int32_t src_time, const int32_t src_height, + const int32_t src_width, const int32_t dst_time, + const int32_t dst_height, const int32_t dst_width) { + Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_time, src_height, src_width, dst_time, dst_height, + dst_width); +}; + +template +struct PoolKernelUtil { + static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool1dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], + params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), + params_3d.pool_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); + } + + static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool1dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); + } + + static void Maxpool2dForwardCFirst(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dForwardCFirst<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], + 
params_3d.pool_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); + } + + static void Maxpool2dBackwardCFirst(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dBackwardCFirst<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); + } + + static void Maxpool2dForwardCLast(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dForwardCLast<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), + params_3d.GetYShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[1], + params_3d.dilation_3d()[2]); + } + + static void Maxpool2dBackwardCLast(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dBackwardCLast<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); + } + + static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool3dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], + params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], + params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1], + params_3d.stride_3d()[2], params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], + params_3d.dilation_3d()[2]); + } + + static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool3dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), + params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4)); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOL_KERNEL_UTIL, (DeviceType::kCUDA), + POOL_DATA_TYPE_CUDA_SEQ, POOL_IDX_DATA_TYPE_SEQ); + +} // namespace oneflow #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/median_kernel.hip.cpp b/oneflow/user/kernels/median_kernel.hip.cpp index 34bdae2..7f71b02 100644 --- 
a/oneflow/user/kernels/median_kernel.hip.cpp +++ b/oneflow/user/kernels/median_kernel.hip.cpp @@ -1,69 +1,69 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" - -namespace oneflow { - -template -class CudaMedianKernel final : public user_op::OpKernel { - public: - CudaMedianKernel() = default; - ~CudaMedianKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("output", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int32_t instance_size = in->shape_view().elem_cnt(); - const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(T)); - SortKeysAscending( - in->dptr(), 1, instance_size, - reinterpret_cast(tmp_buffer->mut_dptr() + sort_tensor_buffer_bytes), - tmp_buffer->shape_view().elem_cnt() - sort_tensor_buffer_bytes, tmp_buffer->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - Memcpy(ctx->stream(), out->mut_dptr(), - tmp_buffer->mut_dptr() + (instance_size - 1) / 2, sizeof(T)); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_MEDIAN_KERNEL(dtype) \ - REGISTER_USER_KERNEL("median") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("input", 0); \ - const int32_t instance_size = in_shape.elem_cnt(); \ - size_t sort_tmp_buffer_bytes = \ - InferTempStorageForSortKeysAscending(1, instance_size); \ - size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(dtype)); \ - return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes; \ - }); - -REGISTER_CUDA_MEDIAN_KERNEL(float) -REGISTER_CUDA_MEDIAN_KERNEL(double) -REGISTER_CUDA_MEDIAN_KERNEL(int8_t) -REGISTER_CUDA_MEDIAN_KERNEL(uint8_t) -REGISTER_CUDA_MEDIAN_KERNEL(int32_t) -REGISTER_CUDA_MEDIAN_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" + +namespace oneflow { + +template +class CudaMedianKernel final : public user_op::OpKernel { + public: + CudaMedianKernel() = default; + ~CudaMedianKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("output", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int32_t instance_size = in->shape_view().elem_cnt(); + const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(T)); + SortKeysAscending( + in->dptr(), 1, instance_size, + reinterpret_cast(tmp_buffer->mut_dptr() + sort_tensor_buffer_bytes), + tmp_buffer->shape_view().elem_cnt() - sort_tensor_buffer_bytes, tmp_buffer->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + Memcpy(ctx->stream(), out->mut_dptr(), + tmp_buffer->mut_dptr() + (instance_size - 1) / 2, sizeof(T)); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_MEDIAN_KERNEL(dtype) \ + REGISTER_USER_KERNEL("median") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + const Shape& in_shape = ctx->InputShape("input", 0); \ + const int32_t instance_size = in_shape.elem_cnt(); \ + size_t sort_tmp_buffer_bytes = \ + InferTempStorageForSortKeysAscending(1, instance_size); \ + size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(dtype)); \ + return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes; \ + }); + +REGISTER_CUDA_MEDIAN_KERNEL(float) +REGISTER_CUDA_MEDIAN_KERNEL(double) +REGISTER_CUDA_MEDIAN_KERNEL(int8_t) +REGISTER_CUDA_MEDIAN_KERNEL(uint8_t) +REGISTER_CUDA_MEDIAN_KERNEL(int32_t) +REGISTER_CUDA_MEDIAN_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/median_with_indices_kernel.hip.cpp b/oneflow/user/kernels/median_with_indices_kernel.hip.cpp index e5a8187..9485de1 100644 --- a/oneflow/user/kernels/median_with_indices_kernel.hip.cpp +++ b/oneflow/user/kernels/median_with_indices_kernel.hip.cpp @@ -1,156 +1,156 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
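/*
 * Illustration: the median kernel above sorts the flattened input ascending
 * with a device radix sort and then copies out the element at index
 * (instance_size - 1) / 2, i.e. the lower median. A minimal host-side sketch
 * of the same selection; LowerMedian and the use of std::nth_element are
 * illustrative stand-ins, not part of this kernel.
 */
#include <algorithm>
#include <vector>

template<typename T>
T LowerMedian(std::vector<T> v) {
  // Assumes a non-empty input. Place the element with sorted rank (n - 1) / 2
  // at that position; reading it matches picking sorted[(instance_size - 1) / 2]
  // after a full ascending sort.
  const size_t nth = (v.size() - 1) / 2;
  std::nth_element(v.begin(), v.begin() + nth, v.end());
  return v[nth];
}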
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void MedianSelectCuda(const IDX reduce_elem_cnt, const IDX stride, const T* in, - const int64_t* sort_indices, T* values, int64_t* indices) { - IDX nth = (stride - 1) / 2; - CUDA_1D_KERNEL_LOOP_T(IDX, i, reduce_elem_cnt) { - values[i] = in[i * stride + nth]; - indices[i] = sort_indices[i * stride + nth]; - } -} - -bool IsSafeUseIndex32(int64_t elem_cnt) { return elem_cnt < GetMaxVal() / 2; } - -template -void DispatchIndexSize(ep::Stream* stream, const int64_t elem_cnt, const int64_t stride, - const T* in, const int64_t* sort_indices, T* out, int64_t* out_indices) { - const int64_t reduce_elem_cnt = elem_cnt / stride; - if (IsSafeUseIndex32(elem_cnt)) { - RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, - stride, in, sort_indices, out, out_indices); - } else { - RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, - stride, in, sort_indices, out, out_indices); - } -} - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(size_t capacity, void* ptr, const ShapeView& in_shape) - : capacity_{capacity}, - sorted_in_elem_cnt_{in_shape.elem_cnt()}, - indices_elem_cnt_{sorted_in_elem_cnt_} { - const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); - const size_t sort_indices_buffer_bytes = - GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); - sorted_in_ptr_ = reinterpret_cast(ptr); - in_indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) - + sort_tensor_buffer_bytes); - out_indices_ptr_ = reinterpret_cast(reinterpret_cast(in_indices_ptr_) - + sort_indices_buffer_bytes); - temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(out_indices_ptr_) - + sort_indices_buffer_bytes); - temp_storage_bytes_ = capacity_ - sort_tensor_buffer_bytes - sort_indices_buffer_bytes * 2; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - T* SortedInPtr() const { return sorted_in_ptr_; } - int64_t* InIndicesPtr() const { return in_indices_ptr_; } - int64_t* OutIndicesPtr() const { return out_indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - size_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - size_t capacity_; - - T* sorted_in_ptr_; - int64_t* in_indices_ptr_; - int64_t* out_indices_ptr_; - void* temp_storage_ptr_; - - int64_t sorted_in_elem_cnt_; - int64_t indices_elem_cnt_; - size_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; -} - -} // namespace - -template -class CudaMedianWithIndicesKernel final : public user_op::OpKernel { - public: - CudaMedianWithIndicesKernel() = default; - ~CudaMedianWithIndicesKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); - if (in->shape_view().elem_cnt() == 0) return; - user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); - user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 
0); - TmpBufferManager buf_manager(tmp_buffer->shape_view().elem_cnt(), - tmp_buffer->mut_dptr(), in->shape_view()); - - const int64_t elem_cnt = in->shape_view().elem_cnt(); - const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int64_t instance_num = elem_cnt / instance_size; - RUN_CUDA_KERNEL(InitializeIndices, ctx->stream(), elem_cnt, elem_cnt, - buf_manager.InIndicesPtr(), instance_size); - SortPairsAscending(in->dptr(), buf_manager.InIndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), buf_manager.OutIndicesPtr(), - ctx->stream()->As()->cuda_stream()); - DispatchIndexSize(ctx->stream(), elem_cnt, instance_size, buf_manager.SortedInPtr(), - buf_manager.OutIndicesPtr(), values->mut_dptr(), - indices->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(dtype) \ - REGISTER_USER_KERNEL("median_with_indices") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("input", 0); \ - const int64_t instance_size = in_shape.dim_vec().back(); \ - const int64_t instance_num = in_shape.elem_cnt() / instance_size; \ - size_t sort_tmp_buffer_bytes = \ - InferTempStorageForSortPairsAscending(instance_num, instance_size); \ - size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - size_t sort_indices_buffer_bytes = \ - GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(int64_t)); \ - return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes + sort_indices_buffer_bytes * 2; \ - }); - -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(float) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(double) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int8_t) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(uint8_t) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int32_t) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void MedianSelectCuda(const IDX reduce_elem_cnt, const IDX stride, const T* in, + const int64_t* sort_indices, T* values, int64_t* indices) { + IDX nth = (stride - 1) / 2; + CUDA_1D_KERNEL_LOOP_T(IDX, i, reduce_elem_cnt) { + values[i] = in[i * stride + nth]; + indices[i] = sort_indices[i * stride + nth]; + } +} + +bool IsSafeUseIndex32(int64_t elem_cnt) { return elem_cnt < GetMaxVal() / 2; } + +template +void DispatchIndexSize(ep::Stream* stream, const int64_t elem_cnt, const int64_t stride, + const T* in, const int64_t* sort_indices, T* out, int64_t* out_indices) { + const int64_t reduce_elem_cnt = elem_cnt / stride; + if (IsSafeUseIndex32(elem_cnt)) { + RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, + stride, in, sort_indices, out, out_indices); + } else { + RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, + stride, in, sort_indices, out, out_indices); + } +} + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(size_t capacity, void* ptr, const ShapeView& in_shape) + : capacity_{capacity}, + sorted_in_elem_cnt_{in_shape.elem_cnt()}, + indices_elem_cnt_{sorted_in_elem_cnt_} { + const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); + const size_t sort_indices_buffer_bytes = + GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); + sorted_in_ptr_ = reinterpret_cast(ptr); + in_indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) + + sort_tensor_buffer_bytes); + out_indices_ptr_ = reinterpret_cast(reinterpret_cast(in_indices_ptr_) + + sort_indices_buffer_bytes); + temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(out_indices_ptr_) + + sort_indices_buffer_bytes); + temp_storage_bytes_ = capacity_ - sort_tensor_buffer_bytes - sort_indices_buffer_bytes * 2; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + T* SortedInPtr() const { return sorted_in_ptr_; } + int64_t* InIndicesPtr() const { return in_indices_ptr_; } + int64_t* OutIndicesPtr() const { return out_indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + size_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + size_t capacity_; + + T* sorted_in_ptr_; + int64_t* in_indices_ptr_; + int64_t* out_indices_ptr_; + void* temp_storage_ptr_; + + int64_t sorted_in_elem_cnt_; + int64_t indices_elem_cnt_; + size_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; +} + +} // namespace + +template +class CudaMedianWithIndicesKernel final : public user_op::OpKernel { + public: + CudaMedianWithIndicesKernel() = default; + ~CudaMedianWithIndicesKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); + if (in->shape_view().elem_cnt() == 0) return; + user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); + user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 
0); + TmpBufferManager buf_manager(tmp_buffer->shape_view().elem_cnt(), + tmp_buffer->mut_dptr(), in->shape_view()); + + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = elem_cnt / instance_size; + RUN_CUDA_KERNEL(InitializeIndices, ctx->stream(), elem_cnt, elem_cnt, + buf_manager.InIndicesPtr(), instance_size); + SortPairsAscending(in->dptr(), buf_manager.InIndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), buf_manager.OutIndicesPtr(), + ctx->stream()->As()->cuda_stream()); + DispatchIndexSize(ctx->stream(), elem_cnt, instance_size, buf_manager.SortedInPtr(), + buf_manager.OutIndicesPtr(), values->mut_dptr(), + indices->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(dtype) \ + REGISTER_USER_KERNEL("median_with_indices") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + const Shape& in_shape = ctx->InputShape("input", 0); \ + const int64_t instance_size = in_shape.dim_vec().back(); \ + const int64_t instance_num = in_shape.elem_cnt() / instance_size; \ + size_t sort_tmp_buffer_bytes = \ + InferTempStorageForSortPairsAscending(instance_num, instance_size); \ + size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + size_t sort_indices_buffer_bytes = \ + GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(int64_t)); \ + return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes + sort_indices_buffer_bytes * 2; \ + }); + +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(float) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(double) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int8_t) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(uint8_t) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int32_t) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/min_max_observer_kernel.hip.cpp b/oneflow/user/kernels/min_max_observer_kernel.hip.cpp index 82c4cb4..df27571 100644 --- a/oneflow/user/kernels/min_max_observer_kernel.hip.cpp +++ b/oneflow/user/kernels/min_max_observer_kernel.hip.cpp @@ -1,260 +1,260 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
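/*
 * Illustration: TmpBufferManager above slices one workspace allocation into a
 * sorted-values region, two int64 index regions, and whatever remains as sort
 * temp storage, with each region size rounded up to a CUDA-friendly alignment.
 * A minimal sketch of that layout arithmetic; AlignUp and the 512-byte
 * alignment are illustrative assumptions standing in for GetCudaAlignedSize.
 */
#include <cstddef>
#include <cstdint>

constexpr std::size_t kAlignment = 512;

inline std::size_t AlignUp(std::size_t n) {
  return (n + kAlignment - 1) / kAlignment * kAlignment;
}

struct WorkspaceLayout {
  std::size_t values_bytes;        // sorted values: elem_cnt * sizeof(T)
  std::size_t indices_bytes;       // one index buffer: elem_cnt * sizeof(int64_t)
  std::size_t temp_storage_bytes;  // remainder, handed to the sort routine
};

inline WorkspaceLayout MakeLayout(std::size_t capacity, std::size_t elem_cnt,
                                  std::size_t value_size) {
  WorkspaceLayout layout{};
  layout.values_bytes = AlignUp(elem_cnt * value_size);
  layout.indices_bytes = AlignUp(elem_cnt * sizeof(std::int64_t));
  // Two index buffers (input order and sorted order) are carved out of capacity.
  layout.temp_storage_bytes =
      capacity - layout.values_bytes - 2 * layout.indices_bytes;
  return layout;
}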
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#include - -namespace oneflow { - -namespace { - -// NOTE(Liang Depeng): refer to -// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda -template -__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, - T* min_ptr) { - extern __shared__ unsigned char shared_max_min_memory[]; - T* shared_max = reinterpret_cast(shared_max_min_memory); - T* shared_min = shared_max + blockDim.x; - - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - shared_max[tid] = -FLT_MAX; - shared_min[tid] = -FLT_MAX; - - while (gid < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[gid]); - shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); - gid += gridDim.x * blockDim.x; - } - __syncthreads(); - gid = (blockDim.x * blockIdx.x) + tid; - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s && gid < elements) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); - } - __syncthreads(); - } - - if (tid == 0) { - cuda::atomic::Max(max_ptr, shared_max[0]); - cuda::atomic::Max(min_ptr, shared_min[0]); - } -} - -template -__global__ void ReduceMaxMinPerChannel(const T* input_ptr, const int64_t elements, - const int64_t num_channels, const int64_t panel_size, - T* max_ptr, T* min_ptr) { - extern __shared__ unsigned char shared_max_min_memory[]; - T* shared_max = reinterpret_cast(shared_max_min_memory); - T* shared_min = shared_max + blockDim.x; - - int64_t cur_channel = blockIdx.x; - int64_t tid = threadIdx.x; - - while (cur_channel < num_channels) { - shared_max[tid] = -FLT_MAX; - shared_min[tid] = -FLT_MAX; - - int64_t index = (panel_size * cur_channel) + tid; - int64_t end = panel_size * (cur_channel + 1); - - while (index < end && index < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[index]); - shared_min[tid] = max(shared_min[tid], -input_ptr[index]); - index += blockDim.x; - } - __syncthreads(); - - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); - } - __syncthreads(); - } - - if (tid == 0) { - cuda::atomic::Max(&max_ptr[cur_channel], shared_max[0]); - cuda::atomic::Max(&min_ptr[cur_channel], shared_min[0]); - } - - // __syncthreads(); - cur_channel += gridDim.x; - } -} - -template -__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - max_ptr[gid] = -FLT_MAX; - min_ptr[gid] = -FLT_MAX; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointSymmetric(const T* max_ptr, const T* min_ptr, - const int64_t elements, const double quantization_bit, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = weight_max / denominator; - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointAffine(const T* max_ptr, const T* 
min_ptr, const int64_t elements, - const double quantization_bit, T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit)) - 1; - T min = -min_ptr[gid]; - T s = (max_ptr[gid] - min) / denominator; - scale[gid] = s; - zero_point[gid] = -nearbyint(min / s); - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointCambricon(const T* max_ptr, const T* min_ptr, - const int64_t elements, const double quantization_bit, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - // T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = floor(log2(weight_max)) - (quantization_bit - 2); - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, - size_t shared_mem_size) { - ep::CudaLaunchConfig config; - stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); - config.shared_mem_size = shared_mem_size; - return config; -} - -} // namespace - -#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) \ - (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); - -template -class GpuMinMaxObserverKernel final : public user_op::OpKernel { - public: - GpuMinMaxObserverKernel() = default; - ~GpuMinMaxObserverKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - const int64_t elements = in->shape_view().elem_cnt(); - const int64_t channel = scale->shape_view().At(0); - const int64_t panel_size = elements / channel; - T* max_ptr = tmp_buffer->mut_dptr(); - T* min_ptr = max_ptr + channel; - auto* cuda_stream = ctx->stream()->As(); - LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, channel, 0, channel, max_ptr, min_ptr); - - if (per_layer_quantization) { - LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, - kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, - min_ptr); - } else { // per-channel quantization - // NOTE(Liang Depeng): each block of threads will be responsible for - // computing the max and min values of the whole channel. 
- LAUNCH_CUDA_KERNEL((ReduceMaxMinPerChannel), cuda_stream, - channel * kCudaThreadsNumPerBlock, kCudaThreadsNumPerBlock * 2 * sizeof(T), - in->dptr(), elements, channel, panel_size, max_ptr, min_ptr); - } - - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, channel, 0, max_ptr, - min_ptr, channel, static_cast(quantization_bit), - scale->mut_dptr(), zero_point->mut_dptr()); - } else { // quantization_scheme == "affine" - LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, channel, 0, max_ptr, min_ptr, - channel, static_cast(quantization_bit), scale->mut_dptr(), - zero_point->mut_dptr()); - } - } else if (quantization_formula == "cambricon") { - if (!per_layer_quantization) { - UNIMPLEMENTED() << " per-channel mode is not supported in cambricon scheme"; - } - LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, channel, 0, max_ptr, min_ptr, - channel, static_cast(quantization_bit), scale->mut_dptr(), - zero_point->mut_dptr()); - } else { - UNIMPLEMENTED(); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MIN_MAX_OBSERVER_KERNEL(dtype) \ - REGISTER_USER_KERNEL("min_max_observer") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - size_t tmp_buffer_size = 1; \ - if (ctx->Attr("per_layer_quantization") == false) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - tmp_buffer_size = in_shape.At(0); \ - } \ - return 2 * tmp_buffer_size * sizeof(dtype); \ - }) - -REGISTER_MIN_MAX_OBSERVER_KERNEL(float); -REGISTER_MIN_MAX_OBSERVER_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
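/*
 * Illustration: the ReduceMaxMin kernels in this file only ever reduce with
 * max(). The minimum is recovered by accumulating max(-x) and negating later,
 * so a single cuda::atomic::Max primitive serves both statistics. A scalar
 * sketch of the identity min(v) == -max(-v) that the kernels rely on.
 */
#include <algorithm>
#include <cfloat>
#include <vector>

inline void MaxAndNegatedMin(const std::vector<float>& v, float* max_out,
                             float* negated_min_out) {
  float running_max = -FLT_MAX;      // mirrors shared_max[tid] = -FLT_MAX
  float running_neg_min = -FLT_MAX;  // mirrors shared_min[tid] = -FLT_MAX
  for (const float x : v) {
    running_max = std::max(running_max, x);
    running_neg_min = std::max(running_neg_min, -x);  // min(v) == -max(-v)
  }
  *max_out = running_max;
  *negated_min_out = running_neg_min;  // negate on the consumer side for the true minimum
}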
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#include + +namespace oneflow { + +namespace { + +// NOTE(Liang Depeng): refer to +// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda +template +__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, + T* min_ptr) { + extern __shared__ unsigned char shared_max_min_memory[]; + T* shared_max = reinterpret_cast(shared_max_min_memory); + T* shared_min = shared_max + blockDim.x; + + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + shared_max[tid] = -FLT_MAX; + shared_min[tid] = -FLT_MAX; + + while (gid < elements) { + shared_max[tid] = max(shared_max[tid], input_ptr[gid]); + shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); + gid += gridDim.x * blockDim.x; + } + __syncthreads(); + gid = (blockDim.x * blockIdx.x) + tid; + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && gid < elements) { + shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + } + __syncthreads(); + } + + if (tid == 0) { + cuda::atomic::Max(max_ptr, shared_max[0]); + cuda::atomic::Max(min_ptr, shared_min[0]); + } +} + +template +__global__ void ReduceMaxMinPerChannel(const T* input_ptr, const int64_t elements, + const int64_t num_channels, const int64_t panel_size, + T* max_ptr, T* min_ptr) { + extern __shared__ unsigned char shared_max_min_memory[]; + T* shared_max = reinterpret_cast(shared_max_min_memory); + T* shared_min = shared_max + blockDim.x; + + int64_t cur_channel = blockIdx.x; + int64_t tid = threadIdx.x; + + while (cur_channel < num_channels) { + shared_max[tid] = -FLT_MAX; + shared_min[tid] = -FLT_MAX; + + int64_t index = (panel_size * cur_channel) + tid; + int64_t end = panel_size * (cur_channel + 1); + + while (index < end && index < elements) { + shared_max[tid] = max(shared_max[tid], input_ptr[index]); + shared_min[tid] = max(shared_min[tid], -input_ptr[index]); + index += blockDim.x; + } + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + } + __syncthreads(); + } + + if (tid == 0) { + cuda::atomic::Max(&max_ptr[cur_channel], shared_max[0]); + cuda::atomic::Max(&min_ptr[cur_channel], shared_min[0]); + } + + // __syncthreads(); + cur_channel += gridDim.x; + } +} + +template +__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + max_ptr[gid] = -FLT_MAX; + min_ptr[gid] = -FLT_MAX; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointSymmetric(const T* max_ptr, const T* min_ptr, + const int64_t elements, const double quantization_bit, + T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = weight_max / denominator; + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointAffine(const T* max_ptr, const T* 
min_ptr, const int64_t elements, + const double quantization_bit, T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit)) - 1; + T min = -min_ptr[gid]; + T s = (max_ptr[gid] - min) / denominator; + scale[gid] = s; + zero_point[gid] = -nearbyint(min / s); + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointCambricon(const T* max_ptr, const T* min_ptr, + const int64_t elements, const double quantization_bit, + T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + // T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = floor(log2(weight_max)) - (quantization_bit - 2); + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, + size_t shared_mem_size) { + ep::CudaLaunchConfig config; + stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); + config.shared_mem_size = shared_mem_size; + return config; +} + +} // namespace + +#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) \ + (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); + +template +class GpuMinMaxObserverKernel final : public user_op::OpKernel { + public: + GpuMinMaxObserverKernel() = default; + ~GpuMinMaxObserverKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t channel = scale->shape_view().At(0); + const int64_t panel_size = elements / channel; + T* max_ptr = tmp_buffer->mut_dptr(); + T* min_ptr = max_ptr + channel; + auto* cuda_stream = ctx->stream()->As(); + LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, channel, 0, channel, max_ptr, min_ptr); + + if (per_layer_quantization) { + LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, + kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, + min_ptr); + } else { // per-channel quantization + // NOTE(Liang Depeng): each block of threads will be responsible for + // computing the max and min values of the whole channel. 
+ LAUNCH_CUDA_KERNEL((ReduceMaxMinPerChannel), cuda_stream, + channel * kCudaThreadsNumPerBlock, kCudaThreadsNumPerBlock * 2 * sizeof(T), + in->dptr(), elements, channel, panel_size, max_ptr, min_ptr); + } + + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, channel, 0, max_ptr, + min_ptr, channel, static_cast(quantization_bit), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { // quantization_scheme == "affine" + LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, channel, 0, max_ptr, min_ptr, + channel, static_cast(quantization_bit), scale->mut_dptr(), + zero_point->mut_dptr()); + } + } else if (quantization_formula == "cambricon") { + if (!per_layer_quantization) { + UNIMPLEMENTED() << " per-channel mode is not supported in cambricon scheme"; + } + LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, channel, 0, max_ptr, min_ptr, + channel, static_cast(quantization_bit), scale->mut_dptr(), + zero_point->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MIN_MAX_OBSERVER_KERNEL(dtype) \ + REGISTER_USER_KERNEL("min_max_observer") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + size_t tmp_buffer_size = 1; \ + if (ctx->Attr("per_layer_quantization") == false) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + tmp_buffer_size = in_shape.At(0); \ + } \ + return 2 * tmp_buffer_size * sizeof(dtype); \ + }) + +REGISTER_MIN_MAX_OBSERVER_KERNEL(float); +REGISTER_MIN_MAX_OBSERVER_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/model_update_kernel_util.hip.cpp b/oneflow/user/kernels/model_update_kernel_util.hip.cpp index 22e1510..ddb698d 100644 --- a/oneflow/user/kernels/model_update_kernel_util.hip.cpp +++ b/oneflow/user/kernels/model_update_kernel_util.hip.cpp @@ -1,799 +1,799 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
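/*
 * Illustration: the scale and zero-point formulas used by the kernels above,
 * written out on the host. "symmetric" spreads max(|max|, |min|) over
 * 2^(bit-1) - 1 levels with zero_point fixed at 0; "affine" spreads
 * (max - min) over 2^bit - 1 levels with zero_point = -nearbyint(min / scale).
 * min_val here is the true minimum; the kernels store its negation, hence the
 * -min_ptr[gid] in CalScaleZeroPointAffine. A minimal sketch mirroring those
 * formulas.
 */
#include <algorithm>
#include <cmath>

struct QuantParams {
  double scale;
  double zero_point;
};

inline QuantParams SymmetricParams(double max_val, double min_val, int quantization_bit) {
  const double weight_max = std::max(std::fabs(max_val), std::fabs(min_val));
  const double denominator = std::pow(2.0, quantization_bit - 1) - 1.0;
  return {weight_max / denominator, 0.0};
}

inline QuantParams AffineParams(double max_val, double min_val, int quantization_bit) {
  const double denominator = std::pow(2.0, quantization_bit) - 1.0;
  const double scale = (max_val - min_val) / denominator;
  return {scale, -std::nearbyint(min_val / scale)};
}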
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/model_update_kernel_util.h" -#include -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void SGDUpdateGpu(int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* model) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - SGDUpdateFunctor()(model_diff + i, model + i, scale, l1, l2, weight_decay, - learning_rate_val); - } -} - -template -__global__ void IndexedSlicesSGDUpdateGpu(float weight_decay, const IDX feature_size, - const int64_t lower_bound, const int64_t upper_bound, - const IDX* num_unique_instance, - const float* learning_rate, const K* indices, - const T* values, T* model) { - const int64_t n = *num_unique_instance * feature_size; - const T lr = *learning_rate; - CUDA_1D_KERNEL_LOOP_T(IDX, i, n) { - const IDX indices_idx = i / feature_size; - const IDX inner_idx = i - indices_idx * feature_size; - const IDX instance_id = indices[indices_idx]; - if (instance_id >= lower_bound && instance_id < upper_bound) { - const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; - SGDUpdateFunctor()(values + i, model + model_idx, static_cast(1), 0.0, 0.0, - weight_decay, lr); - } - } -} - -template -__global__ void SumSquares2(int64_t n, const T* src0, T* dst0, const T* src1, T* dst1) { - T t_sum0 = 0; - T t_sum1 = 0; - CUDA_1D_KERNEL_LOOP(i, n) { - t_sum0 += src0[i] * src0[i]; - t_sum1 += src1[i] * src1[i]; - } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage0; - __shared__ typename BlockReduce::TempStorage temp_storage1; - T b_sum0 = BlockReduce(temp_storage0).Sum(t_sum0); - T b_sum1 = BlockReduce(temp_storage1).Sum(t_sum1); - if (threadIdx.x == 0) { - cuda::atomic::Add(dst0, b_sum0); - cuda::atomic::Add(dst1, b_sum1); - } -} - -} // namespace - -template -struct SGDUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model); -}; - -template -void SGDUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model) { - SGDUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, - model_diff, model); -} - -template -struct SGDUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model); -}; - -template -void SGDUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model) { - SGDUpdateKernelUtil::Update( - stream, n, scale, l1, 
l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, - skip_if, reinterpret_cast(model_diff), model); -} - -template struct SGDUpdateKernelUtil; -template struct SGDUpdateKernelUtil; -template struct SGDUpdateKernelUtil; - -template -struct IndexedSlicesSGDUpdateKernelUtil { - static void Update(ep::Stream* stream, float weight_decay, int64_t num_indices, - int64_t feature_size, int64_t lower_bound, int64_t upper_bound, - const IDX* num_unique_instance, const float* learning_rate, const K* indices, - const T* values, T* model); -}; - -template -void IndexedSlicesSGDUpdateKernelUtil::Update( - ep::Stream* stream, float weight_decay, int64_t num_indices, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const K* indices, const T* values, T* model) { - IndexedSlicesSGDUpdateGpu - <<As()->cuda_stream()>>>(weight_decay, feature_size, lower_bound, - upper_bound, num_unique_instance, - learning_rate, indices, values, model); -} - -#define INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA(val_type_pair, key_type_pair, \ - idx_type_pair) \ - template struct IndexedSlicesSGDUpdateKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ - OF_PP_PAIR_FIRST(idx_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA - -namespace { - -template -__global__ void MomentumUpdateGpu(int64_t n, T scale, float l1, float l2, float beta, - float weight_decay, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, - T* momentum) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - MomentumUpdateFunctor()(model_diff + i, model + i, momentum + i, scale, l1, l2, beta, - weight_decay, learning_rate_val); - } -} - -template -__global__ void IndexedSlicesMomentumUpdateGpu(T beta, float weight_decay, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, - const IDX* num_unique_instance, - const float* learning_rate, const K* indices, - const T* values, T* model, T* momentum) { - const int64_t n = *num_unique_instance * feature_size; - const T lr = *learning_rate; - CUDA_1D_KERNEL_LOOP(i, n) { - const IDX indices_idx = i / feature_size; - const IDX inner_idx = i - indices_idx * feature_size; - const IDX instance_id = indices[indices_idx]; - if (instance_id >= lower_bound && instance_id < upper_bound) { - const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; - MomentumUpdateFunctor()(values + i, model + model_idx, momentum + model_idx, - static_cast(1), 0.0, 0.0, beta, weight_decay, lr); - } - } -} - -} // namespace - -template -struct MomentumUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, - float weight_decay, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, - T* momentum); -}; - -template -void MomentumUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* 
scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* momentum) { - MomentumUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, - model_diff, model, momentum); -} - -template -struct MomentumUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, - float weight_decay, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, - T* model, T* momentum); -}; - -template -void MomentumUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model, T* momentum) { - MomentumUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, - skip_if, reinterpret_cast(model_diff), model, momentum); -} - -template struct MomentumUpdateKernelUtil; -template struct MomentumUpdateKernelUtil; -template struct MomentumUpdateKernelUtil; - -template -struct IndexedSlicesMomentumMdUpdateKernelUtil { - static void Update(ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, - int64_t feature_size, int64_t lower_bound, int64_t upper_bound, - const IDX* num_unique_instance, const float* learning_rate, const K* indices, - const T* values, T* model, T* momentum); -}; - -template -void IndexedSlicesMomentumMdUpdateKernelUtil::Update( - ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const K* indices, const T* values, T* model, T* momentum) { - IndexedSlicesMomentumUpdateGpu - <<As()->cuda_stream()>>>( - beta, weight_decay, feature_size, lower_bound, upper_bound, num_unique_instance, - learning_rate, indices, values, model, momentum); -} - -#define INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ - val_type_pair, key_type_pair, idx_type_pair) \ - template struct IndexedSlicesMomentumMdUpdateKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ - OF_PP_PAIR_FIRST(idx_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA - -namespace { - -__global__ void BiasCorrectionFactorKernelGpu(float beta, const int64_t* train_step, float* out) { - const auto exponent = static_cast(*train_step + 1); - const float bias_correction_factor = 1.0 - static_cast(pow(beta, exponent)); - *out = bias_correction_factor; -} - -template -__global__ void AdamUpdateGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float learning_rate_val, - float bias_correction1_val, float bias_correction2_val, - const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const G* model_diff, T* model, - T* m, T* v, T* max_v) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != 
nullptr) { scale *= *scale_by_ptr; } - if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } - - CUDA_1D_KERNEL_LOOP(i, n) { - AdamUpdateFunctor()(model_diff + i, model + i, m + i, v + i, max_v + i, scale, l1, l2, - beta1, beta2, epsilon, weight_decay, amsgrad, bias_correction1_val, - bias_correction2_val, learning_rate_val); - } -} - -template -__global__ void AdamUpdateBetaTGpu(const T beta1, const T beta2, const int64_t* skip_if, T* beta1_t, - T* beta2_t) { - if (skip_if != nullptr && *skip_if != 0) { return; } - *beta1_t *= beta1; - *beta2_t *= beta2; -} - -template -__global__ void IndexedSlicesAdamUpdateGpu( - float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float lr, int64_t feature_size, int64_t lower_bound, - int64_t upper_bound, const IDX* num_unique_instance, const float* learning_rate, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const K* indices, - const T* values, T* model, T* m, T* v, T* max_v) { - if (learning_rate != nullptr) { lr = *learning_rate; } - float bias_correction1 = 1.0; - float bias_correction2 = 1.0; - if (bias_correction1_ptr != nullptr) { bias_correction1 = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2 = *bias_correction2_ptr; } - - const int64_t n = *num_unique_instance * feature_size; - CUDA_1D_KERNEL_LOOP(i, n) { - const IDX indices_idx = i / feature_size; - const IDX inner_idx = i - indices_idx * feature_size; - const IDX instance_id = indices[indices_idx]; - if (instance_id >= lower_bound && instance_id < upper_bound) { - const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; - AdamUpdateFunctor()(values + i, model + model_idx, m + model_idx, v + model_idx, - max_v + i, static_cast(1), 0, 0, beta1, beta2, epsilon, - weight_decay, amsgrad, bias_correction1, bias_correction2, lr); - } - } -} - -template -__global__ void LambGradGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* adam_diff, T* model, T* m, T* v, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* bias_correction1_ptr, - const float* bias_correction2_ptr) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, v + i, scale, l1, l2, - beta1, beta2, epsilon, do_bias_correction, bias_correction1_val, - bias_correction2_val); - } -} - -template -__global__ void LambUpdateGpu(int64_t n, float weight_decay, float learning_rate_val, - const float* learning_rate_ptr, const int64_t* skip_if, - const T* w_norm_2, const T* g_norm_2, const T* adam_diff, T* model) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate_ptr != nullptr) { learning_rate_val = *learning_rate_ptr; } - const float lr = LambLRFunctor()(learning_rate_val, w_norm_2, g_norm_2); - CUDA_1D_KERNEL_LOOP(i, n) { LambUpdateFunctor()(lr, weight_decay, adam_diff + i, model + i); } -} - -} // namespace - -template -struct AdamUpdateKernelUtil { - static void Update(ep::Stream* 
stream, int64_t n, T scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float learning_rate_val, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const G* model_diff, T* model, T* m, T* v, - T* max_v); -}; - -template -void AdamUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, - float learning_rate_val, float bias_correction1_val, float bias_correction2_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const G* model_diff, - T* model, T* m, T* v, T* max_v) { - AdamUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, - learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, - skip_if, bias_correction1_ptr, bias_correction2_ptr, model_diff, model, m, v, max_v); -} - -template -struct AdamUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float learning_rate_val, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const float16* model_diff, T* model, T* m, - T* v, T* max_v); -}; - -template -void AdamUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, - float learning_rate_val, float bias_correction1_val, float bias_correction2_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const float16* model_diff, - T* model, T* m, T* v, T* max_v) { - AdamUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, - learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, - skip_if, bias_correction1_ptr, bias_correction2_ptr, - reinterpret_cast(model_diff), model, m, v, max_v); -} - -template struct AdamUpdateKernelUtil; -template struct AdamUpdateKernelUtil; -template struct AdamUpdateKernelUtil; - -template -__global__ void AdagradUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_decay, - float epsilon, float weight_decay, float learning_rate_val, - int64_t train_step, const float* learning_rate, - const int64_t* train_step_ptr, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* sum) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (train_step_ptr != nullptr) { - train_step = *train_step_ptr + 1; - } // train_step_ptr start from zero. 
- if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); - - CUDA_1D_KERNEL_LOOP(i, n) { - AdagradUpdateFunctor()(model_diff + i, model + i, sum + i, scale, l1, l2, epsilon, - weight_decay, learning_rate_val); - } -} - -template -struct AdagradUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, - float epsilon, float weight_decay, float learning_rate_val, int64_t train_step, - const float* learning_rate, const int64_t* train_step_ptr, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, - T* sum); -}; - -template -void AdagradUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, float epsilon, - float weight_decay, float learning_rate_val, int64_t train_step, const float* learning_rate, - const int64_t* train_step_ptr, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* model, T* sum) { - AdagradUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, lr_decay, epsilon, weight_decay, learning_rate_val, train_step, - learning_rate, train_step_ptr, scale_by_ptr, skip_if, model_diff, model, sum); -} - -template struct AdagradUpdateKernelUtil; -template struct AdagradUpdateKernelUtil; - -template -struct LambUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, float learning_rate_val, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer); -}; - -template -void LambUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, - float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, - T* norm_buffer) { - LambGradGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta1, beta2, epsilon, scale_by_ptr, skip_if, model_diff, adam_diff, model, - m, v, do_bias_correction, bias_correction1_val, bias_correction2_val, bias_correction1_ptr, - bias_correction2_ptr); - T* w_norm_2 = norm_buffer; - T* g_norm_2 = norm_buffer + 1; - Memset(stream, norm_buffer, 0, 2 * sizeof(T)); - SumSquares2 - <<As()->cuda_stream()>>>(n, model, w_norm_2, adam_diff, g_norm_2); - LambUpdateGpu<<As()->cuda_stream()>>>( - n, weight_decay, learning_rate_val, learning_rate_ptr, skip_if, w_norm_2, g_norm_2, adam_diff, - model); -} - -template -struct LambUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, float learning_rate_val, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, - const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer); -}; - -template -void 
LambUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, - float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* adam_diff, T* model, T* m, T* v, - T* norm_buffer) { - LambUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate_val, - do_bias_correction, bias_correction1_val, bias_correction2_val, learning_rate_ptr, - bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, skip_if, - reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer); -} - -template struct LambUpdateKernelUtil; -template struct LambUpdateKernelUtil; -template struct LambUpdateKernelUtil; - -template -struct IndexedSlicesAdamMdUpdateKernelUtil { - static void Update(ep::Stream* stream, float beta1, float beta2, float epsilon, - float weight_decay, bool amsgrad, bool do_bias_correction, float lr, - int64_t num_instance, int64_t feature_size, int64_t lower_bound, - int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const K* indices, const T* values, T* model, - T* m, T* v, T* max_v); -}; - -template -void IndexedSlicesAdamMdUpdateKernelUtil::Update( - ep::Stream* stream, float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float lr, int64_t num_instance, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const K* indices, const T* values, T* model, T* m, T* v, - T* max_v) { - IndexedSlicesAdamUpdateGpu - <<As()->cuda_stream()>>>( - beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, lr, feature_size, - lower_bound, upper_bound, num_unique_instance, learning_rate, bias_correction1_ptr, - bias_correction2_ptr, indices, values, model, m, v, max_v); -} - -#define INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ - val_type_pair, key_type_pair, idx_type_pair) \ - template struct IndexedSlicesAdamMdUpdateKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ - OF_PP_PAIR_FIRST(idx_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA - -template<> -struct BiasCorrectionFactorKernelUtil { - static void BiasCorrectionFactorCompute(ep::Stream* stream, float beta, const int64_t* train_step, - float* out); -}; - -void BiasCorrectionFactorKernelUtil::BiasCorrectionFactorCompute( - ep::Stream* stream, float beta, const int64_t* train_step, float* out) { - BiasCorrectionFactorKernelGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( - beta, train_step, out); -} - -namespace { - -template -__global__ void RmsPropUpdateGpu(int64_t n, T scale, float l1, float l2, T* mean_square, - T* mean_gradient, float epsilon, float weight_decay, - float decay_rate, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model) { - 
if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - RmsPropUpdateFunctor()(model_diff + i, model + i, n, scale, l1, l2, - mean_square + i, - (centered ? mean_gradient + i : nullptr), epsilon, - weight_decay, decay_rate, learning_rate_val); - } -} - -} // namespace - -template -struct RmsPropUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, - float epsilon, float weight_decay, float decay_rate, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* model, T* mean_square, T* mean_gradient); -}; - -template -void RmsPropUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, - float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, T* mean_square, - T* mean_gradient) { - if (centered) { - RmsPropUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, - learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); - } else { - RmsPropUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, - learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); - } -} - -template -struct RmsPropUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, - float epsilon, float weight_decay, float decay_rate, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float16* model_diff, T* model, T* mean_square, T* mean_gradient); -}; - -template -void RmsPropUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, - float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, T* model, - T* mean_square, T* mean_gradient) { - RmsPropUpdateKernelUtil::Update( - stream, n, scale, l1, l2, centered, epsilon, weight_decay, decay_rate, learning_rate_val, - learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, - mean_square, mean_gradient); -} - -template struct RmsPropUpdateKernelUtil; -template struct RmsPropUpdateKernelUtil; -template struct RmsPropUpdateKernelUtil; - -namespace { - -template -__global__ void LarsScaleModelDiffGpu(int64_t n, T scale, float l1, float l2, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, - T* model_diff_tmp) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - model_diff_tmp[i] = - CastScaleRegularizeGradientFunctor()(model_diff[i], model[i], scale, l1, l2); - } -} - -template -__global__ void LarsGetLocalLearningRateGpu(const float* learning_rate, T weight_decay, T epsilon, - T lars_coefficient, const int64_t* skip_if, - T* data_tmp) { - if (skip_if != nullptr && *skip_if != 0) { return; } - T* model_norm = &data_tmp[0]; - T* model_diff_norm = &data_tmp[1]; - T* local_learning_rate = &data_tmp[2]; - *model_norm = std::sqrt(*model_norm); - 
*model_diff_norm = std::sqrt(*model_diff_norm); - T lars = static_cast(1); - if (*model_norm > 0 && *model_diff_norm > 0) { - lars = lars_coefficient * (*model_norm) - / (epsilon + (*model_diff_norm) + weight_decay * (*model_norm)); - } - *local_learning_rate = *learning_rate * lars; -} - -template -__global__ void LarsUpdateGpu(int64_t n, float momentum_beta, T* momentum, float weight_decay, - const int64_t* skip_if, T* local_learning_rate, T* model_diff_tmp, - T* model) { - if (skip_if != nullptr && *skip_if != 0) { return; } - CUDA_1D_KERNEL_LOOP(i, n) { - LarsUpdateFunctor()(model_diff_tmp + i, model + i, momentum_beta, momentum + i, weight_decay, - *local_learning_rate); - } -} -} // namespace - -template -struct LarsUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, - float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* model, T* momentum, T* data_tmp, T* model_diff_tmp); -}; - -template -void LarsUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, - float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* momentum, T* data_tmp, - T* model_diff_tmp) { - LarsScaleModelDiffGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, scale_by_ptr, skip_if, model_diff, model, model_diff_tmp); - T* model_norm = data_tmp; - T* model_diff_norm = data_tmp + 1; - T* local_learning_rate = data_tmp + 2; - Memset(stream, data_tmp, 0, 2 * sizeof(T)); - SumSquares2<<As()->cuda_stream()>>>(n, model, model_norm, - model_diff_tmp, model_diff_norm); - LarsGetLocalLearningRateGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( - learning_rate, weight_decay, epsilon, lars_coefficient, skip_if, data_tmp); - LarsUpdateGpu<<As()->cuda_stream()>>>( - n, momentum_beta, momentum, weight_decay, skip_if, local_learning_rate, model_diff_tmp, - model); -} - -template -struct LarsUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, - float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float16* model_diff, T* model, T* momentum, T* data_tmp, - T* model_diff_tmp); -}; - -template -void LarsUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, - float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model, T* momentum, T* data_tmp, - T* model_diff_tmp) { - LarsUpdateKernelUtil::Update( - stream, n, scale, l1, l2, momentum_beta, epsilon, lars_coefficient, weight_decay, - learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, - momentum, data_tmp, model_diff_tmp); -} - -template struct LarsUpdateKernelUtil; -template struct LarsUpdateKernelUtil; -template struct LarsUpdateKernelUtil; - -template -__global__ void FtrlUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, - float lambda2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* model, T* accumulate, T* z) { - if (skip_if != nullptr && *skip_if != 0) { return; } 
- if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - FtrlUpdateFunctor()(model_diff + i, model + i, accumulate + i, z + i, scale, l1, l2, - lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val); - } -} - -template -struct FtrlUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, - float lambda1, float lambda2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* accumulate, T* z); -}; - -template -void FtrlUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, - float lambda2, float beta, float weight_decay, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* model, T* accumulate, T* z) { - FtrlUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, - learning_rate, scale_by_ptr, skip_if, model_diff, model, accumulate, z); -} - -template -struct FtrlUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, - float lambda1, float lambda2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model, T* accumulate, - T* z); -}; - -template -void FtrlUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, - float lambda2, float beta, float weight_decay, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float16* model_diff, T* model, T* accumulate, T* z) { - FtrlUpdateKernelUtil::Update( - stream, n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, - learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, - accumulate, z); -} - -template struct FtrlUpdateKernelUtil; -template struct FtrlUpdateKernelUtil; -template struct FtrlUpdateKernelUtil; +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/model_update_kernel_util.h" +#include +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void SGDUpdateGpu(int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* model) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + SGDUpdateFunctor()(model_diff + i, model + i, scale, l1, l2, weight_decay, + learning_rate_val); + } +} + +template +__global__ void IndexedSlicesSGDUpdateGpu(float weight_decay, const IDX feature_size, + const int64_t lower_bound, const int64_t upper_bound, + const IDX* num_unique_instance, + const float* learning_rate, const K* indices, + const T* values, T* model) { + const int64_t n = *num_unique_instance * feature_size; + const T lr = *learning_rate; + CUDA_1D_KERNEL_LOOP_T(IDX, i, n) { + const IDX indices_idx = i / feature_size; + const IDX inner_idx = i - indices_idx * feature_size; + const IDX instance_id = indices[indices_idx]; + if (instance_id >= lower_bound && instance_id < upper_bound) { + const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; + SGDUpdateFunctor()(values + i, model + model_idx, static_cast(1), 0.0, 0.0, + weight_decay, lr); + } + } +} + +template +__global__ void SumSquares2(int64_t n, const T* src0, T* dst0, const T* src1, T* dst1) { + T t_sum0 = 0; + T t_sum1 = 0; + CUDA_1D_KERNEL_LOOP(i, n) { + t_sum0 += src0[i] * src0[i]; + t_sum1 += src1[i] * src1[i]; + } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage0; + __shared__ typename BlockReduce::TempStorage temp_storage1; + T b_sum0 = BlockReduce(temp_storage0).Sum(t_sum0); + T b_sum1 = BlockReduce(temp_storage1).Sum(t_sum1); + if (threadIdx.x == 0) { + cuda::atomic::Add(dst0, b_sum0); + cuda::atomic::Add(dst1, b_sum1); + } +} + +} // namespace + +template +struct SGDUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model); +}; + +template +void SGDUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model) { + SGDUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, + model_diff, model); +} + +template +struct SGDUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model); +}; + +template +void SGDUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model) { + SGDUpdateKernelUtil::Update( + stream, n, scale, l1, 
l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, + skip_if, reinterpret_cast(model_diff), model); +} + +template struct SGDUpdateKernelUtil; +template struct SGDUpdateKernelUtil; +template struct SGDUpdateKernelUtil; + +template +struct IndexedSlicesSGDUpdateKernelUtil { + static void Update(ep::Stream* stream, float weight_decay, int64_t num_indices, + int64_t feature_size, int64_t lower_bound, int64_t upper_bound, + const IDX* num_unique_instance, const float* learning_rate, const K* indices, + const T* values, T* model); +}; + +template +void IndexedSlicesSGDUpdateKernelUtil::Update( + ep::Stream* stream, float weight_decay, int64_t num_indices, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const K* indices, const T* values, T* model) { + IndexedSlicesSGDUpdateGpu + <<As()->cuda_stream()>>>(weight_decay, feature_size, lower_bound, + upper_bound, num_unique_instance, + learning_rate, indices, values, model); +} + +#define INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA(val_type_pair, key_type_pair, \ + idx_type_pair) \ + template struct IndexedSlicesSGDUpdateKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ + OF_PP_PAIR_FIRST(idx_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA + +namespace { + +template +__global__ void MomentumUpdateGpu(int64_t n, T scale, float l1, float l2, float beta, + float weight_decay, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, + T* momentum) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + MomentumUpdateFunctor()(model_diff + i, model + i, momentum + i, scale, l1, l2, beta, + weight_decay, learning_rate_val); + } +} + +template +__global__ void IndexedSlicesMomentumUpdateGpu(T beta, float weight_decay, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, + const IDX* num_unique_instance, + const float* learning_rate, const K* indices, + const T* values, T* model, T* momentum) { + const int64_t n = *num_unique_instance * feature_size; + const T lr = *learning_rate; + CUDA_1D_KERNEL_LOOP(i, n) { + const IDX indices_idx = i / feature_size; + const IDX inner_idx = i - indices_idx * feature_size; + const IDX instance_id = indices[indices_idx]; + if (instance_id >= lower_bound && instance_id < upper_bound) { + const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; + MomentumUpdateFunctor()(values + i, model + model_idx, momentum + model_idx, + static_cast(1), 0.0, 0.0, beta, weight_decay, lr); + } + } +} + +} // namespace + +template +struct MomentumUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, + float weight_decay, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, + T* momentum); +}; + +template +void MomentumUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* 
scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* momentum) { + MomentumUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, + model_diff, model, momentum); +} + +template +struct MomentumUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, + float weight_decay, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, + T* model, T* momentum); +}; + +template +void MomentumUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model, T* momentum) { + MomentumUpdateKernelUtil::Update( + stream, n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, + skip_if, reinterpret_cast(model_diff), model, momentum); +} + +template struct MomentumUpdateKernelUtil; +template struct MomentumUpdateKernelUtil; +template struct MomentumUpdateKernelUtil; + +template +struct IndexedSlicesMomentumMdUpdateKernelUtil { + static void Update(ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, + int64_t feature_size, int64_t lower_bound, int64_t upper_bound, + const IDX* num_unique_instance, const float* learning_rate, const K* indices, + const T* values, T* model, T* momentum); +}; + +template +void IndexedSlicesMomentumMdUpdateKernelUtil::Update( + ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const K* indices, const T* values, T* model, T* momentum) { + IndexedSlicesMomentumUpdateGpu + <<As()->cuda_stream()>>>( + beta, weight_decay, feature_size, lower_bound, upper_bound, num_unique_instance, + learning_rate, indices, values, model, momentum); +} + +#define INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ + val_type_pair, key_type_pair, idx_type_pair) \ + template struct IndexedSlicesMomentumMdUpdateKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ + OF_PP_PAIR_FIRST(idx_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA + +namespace { + +__global__ void BiasCorrectionFactorKernelGpu(float beta, const int64_t* train_step, float* out) { + const auto exponent = static_cast(*train_step + 1); + const float bias_correction_factor = 1.0 - static_cast(pow(beta, exponent)); + *out = bias_correction_factor; +} + +template +__global__ void AdamUpdateGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float learning_rate_val, + float bias_correction1_val, float bias_correction2_val, + const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const G* model_diff, T* model, + T* m, T* v, T* max_v) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != 
nullptr) { scale *= *scale_by_ptr; } + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } + + CUDA_1D_KERNEL_LOOP(i, n) { + AdamUpdateFunctor()(model_diff + i, model + i, m + i, v + i, max_v + i, scale, l1, l2, + beta1, beta2, epsilon, weight_decay, amsgrad, bias_correction1_val, + bias_correction2_val, learning_rate_val); + } +} + +template +__global__ void AdamUpdateBetaTGpu(const T beta1, const T beta2, const int64_t* skip_if, T* beta1_t, + T* beta2_t) { + if (skip_if != nullptr && *skip_if != 0) { return; } + *beta1_t *= beta1; + *beta2_t *= beta2; +} + +template +__global__ void IndexedSlicesAdamUpdateGpu( + float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float lr, int64_t feature_size, int64_t lower_bound, + int64_t upper_bound, const IDX* num_unique_instance, const float* learning_rate, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const K* indices, + const T* values, T* model, T* m, T* v, T* max_v) { + if (learning_rate != nullptr) { lr = *learning_rate; } + float bias_correction1 = 1.0; + float bias_correction2 = 1.0; + if (bias_correction1_ptr != nullptr) { bias_correction1 = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2 = *bias_correction2_ptr; } + + const int64_t n = *num_unique_instance * feature_size; + CUDA_1D_KERNEL_LOOP(i, n) { + const IDX indices_idx = i / feature_size; + const IDX inner_idx = i - indices_idx * feature_size; + const IDX instance_id = indices[indices_idx]; + if (instance_id >= lower_bound && instance_id < upper_bound) { + const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; + AdamUpdateFunctor()(values + i, model + model_idx, m + model_idx, v + model_idx, + max_v + i, static_cast(1), 0, 0, beta1, beta2, epsilon, + weight_decay, amsgrad, bias_correction1, bias_correction2, lr); + } + } +} + +template +__global__ void LambGradGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* adam_diff, T* model, T* m, T* v, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* bias_correction1_ptr, + const float* bias_correction2_ptr) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, v + i, scale, l1, l2, + beta1, beta2, epsilon, do_bias_correction, bias_correction1_val, + bias_correction2_val); + } +} + +template +__global__ void LambUpdateGpu(int64_t n, float weight_decay, float learning_rate_val, + const float* learning_rate_ptr, const int64_t* skip_if, + const T* w_norm_2, const T* g_norm_2, const T* adam_diff, T* model) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate_ptr != nullptr) { learning_rate_val = *learning_rate_ptr; } + const float lr = LambLRFunctor()(learning_rate_val, w_norm_2, g_norm_2); + CUDA_1D_KERNEL_LOOP(i, n) { LambUpdateFunctor()(lr, weight_decay, adam_diff + i, model + i); } +} + +} // namespace + +template +struct AdamUpdateKernelUtil { + static void Update(ep::Stream* 
stream, int64_t n, T scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float learning_rate_val, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const G* model_diff, T* model, T* m, T* v, + T* max_v); +}; + +template +void AdamUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, + float learning_rate_val, float bias_correction1_val, float bias_correction2_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const G* model_diff, + T* model, T* m, T* v, T* max_v) { + AdamUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, + learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, + skip_if, bias_correction1_ptr, bias_correction2_ptr, model_diff, model, m, v, max_v); +} + +template +struct AdamUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float learning_rate_val, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const float16* model_diff, T* model, T* m, + T* v, T* max_v); +}; + +template +void AdamUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, + float learning_rate_val, float bias_correction1_val, float bias_correction2_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const float16* model_diff, + T* model, T* m, T* v, T* max_v) { + AdamUpdateKernelUtil::Update( + stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, + learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, + skip_if, bias_correction1_ptr, bias_correction2_ptr, + reinterpret_cast(model_diff), model, m, v, max_v); +} + +template struct AdamUpdateKernelUtil; +template struct AdamUpdateKernelUtil; +template struct AdamUpdateKernelUtil; + +template +__global__ void AdagradUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_decay, + float epsilon, float weight_decay, float learning_rate_val, + int64_t train_step, const float* learning_rate, + const int64_t* train_step_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* sum) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (train_step_ptr != nullptr) { + train_step = *train_step_ptr + 1; + } // train_step_ptr start from zero. 
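+  // (Editor's note, not part of the original patch.) The statements below apply the usual
+  // Adagrad schedule, assuming AdagradUpdateFunctor (declared in model_update_kernel_util.h)
+  // matches its CUDA counterpart:
+  //   lr_t  = learning_rate / (1 + (train_step - 1) * lr_decay)
+  //   sum  += g * g
+  //   model -= lr_t * g / (sqrt(sum) + epsilon)   (plus weight decay on model)
+  // where g is the scaled, L1/L2-regularized gradient. Sketch for orientation only.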
+ if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); + + CUDA_1D_KERNEL_LOOP(i, n) { + AdagradUpdateFunctor()(model_diff + i, model + i, sum + i, scale, l1, l2, epsilon, + weight_decay, learning_rate_val); + } +} + +template +struct AdagradUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, + float epsilon, float weight_decay, float learning_rate_val, int64_t train_step, + const float* learning_rate, const int64_t* train_step_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, + T* sum); +}; + +template +void AdagradUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, float epsilon, + float weight_decay, float learning_rate_val, int64_t train_step, const float* learning_rate, + const int64_t* train_step_ptr, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* model, T* sum) { + AdagradUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, lr_decay, epsilon, weight_decay, learning_rate_val, train_step, + learning_rate, train_step_ptr, scale_by_ptr, skip_if, model_diff, model, sum); +} + +template struct AdagradUpdateKernelUtil; +template struct AdagradUpdateKernelUtil; + +template +struct LambUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); +}; + +template +void LambUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, + float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, + T* norm_buffer) { + LambGradGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta1, beta2, epsilon, scale_by_ptr, skip_if, model_diff, adam_diff, model, + m, v, do_bias_correction, bias_correction1_val, bias_correction2_val, bias_correction1_ptr, + bias_correction2_ptr); + T* w_norm_2 = norm_buffer; + T* g_norm_2 = norm_buffer + 1; + Memset(stream, norm_buffer, 0, 2 * sizeof(T)); + SumSquares2 + <<As()->cuda_stream()>>>(n, model, w_norm_2, adam_diff, g_norm_2); + LambUpdateGpu<<As()->cuda_stream()>>>( + n, weight_decay, learning_rate_val, learning_rate_ptr, skip_if, w_norm_2, g_norm_2, adam_diff, + model); +} + +template +struct LambUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); +}; + +template +void 
LambUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, + float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* adam_diff, T* model, T* m, T* v, + T* norm_buffer) { + LambUpdateKernelUtil::Update( + stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate_val, + do_bias_correction, bias_correction1_val, bias_correction2_val, learning_rate_ptr, + bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, skip_if, + reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer); +} + +template struct LambUpdateKernelUtil; +template struct LambUpdateKernelUtil; +template struct LambUpdateKernelUtil; + +template +struct IndexedSlicesAdamMdUpdateKernelUtil { + static void Update(ep::Stream* stream, float beta1, float beta2, float epsilon, + float weight_decay, bool amsgrad, bool do_bias_correction, float lr, + int64_t num_instance, int64_t feature_size, int64_t lower_bound, + int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const K* indices, const T* values, T* model, + T* m, T* v, T* max_v); +}; + +template +void IndexedSlicesAdamMdUpdateKernelUtil::Update( + ep::Stream* stream, float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float lr, int64_t num_instance, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const K* indices, const T* values, T* model, T* m, T* v, + T* max_v) { + IndexedSlicesAdamUpdateGpu + <<As()->cuda_stream()>>>( + beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, lr, feature_size, + lower_bound, upper_bound, num_unique_instance, learning_rate, bias_correction1_ptr, + bias_correction2_ptr, indices, values, model, m, v, max_v); +} + +#define INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ + val_type_pair, key_type_pair, idx_type_pair) \ + template struct IndexedSlicesAdamMdUpdateKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ + OF_PP_PAIR_FIRST(idx_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA + +template<> +struct BiasCorrectionFactorKernelUtil { + static void BiasCorrectionFactorCompute(ep::Stream* stream, float beta, const int64_t* train_step, + float* out); +}; + +void BiasCorrectionFactorKernelUtil::BiasCorrectionFactorCompute( + ep::Stream* stream, float beta, const int64_t* train_step, float* out) { + BiasCorrectionFactorKernelGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( + beta, train_step, out); +} + +namespace { + +template +__global__ void RmsPropUpdateGpu(int64_t n, T scale, float l1, float l2, T* mean_square, + T* mean_gradient, float epsilon, float weight_decay, + float decay_rate, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model) { + 
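+  // (Editor's note, not part of the original patch.) Sketch of the step this kernel performs,
+  // assuming RmsPropUpdateFunctor follows the standard, optionally centered RMSProp rule:
+  //   mean_square   = (1 - decay_rate) * g^2 + decay_rate * mean_square
+  //   mean_gradient = (1 - decay_rate) * g   + decay_rate * mean_gradient      (centered only)
+  //   denom = centered ? mean_square - mean_gradient^2 : mean_square
+  //   model -= learning_rate * g / sqrt(denom + epsilon)
+  // with g the scaled, L1/L2-regularized gradient; the exact functor lives in
+  // model_update_kernel_util.h.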
if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + RmsPropUpdateFunctor()(model_diff + i, model + i, n, scale, l1, l2, + mean_square + i, + (centered ? mean_gradient + i : nullptr), epsilon, + weight_decay, decay_rate, learning_rate_val); + } +} + +} // namespace + +template +struct RmsPropUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, + float epsilon, float weight_decay, float decay_rate, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* model, T* mean_square, T* mean_gradient); +}; + +template +void RmsPropUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, + float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, T* mean_square, + T* mean_gradient) { + if (centered) { + RmsPropUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, + learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); + } else { + RmsPropUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, + learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); + } +} + +template +struct RmsPropUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, + float epsilon, float weight_decay, float decay_rate, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float16* model_diff, T* model, T* mean_square, T* mean_gradient); +}; + +template +void RmsPropUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, + float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, T* model, + T* mean_square, T* mean_gradient) { + RmsPropUpdateKernelUtil::Update( + stream, n, scale, l1, l2, centered, epsilon, weight_decay, decay_rate, learning_rate_val, + learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, + mean_square, mean_gradient); +} + +template struct RmsPropUpdateKernelUtil; +template struct RmsPropUpdateKernelUtil; +template struct RmsPropUpdateKernelUtil; + +namespace { + +template +__global__ void LarsScaleModelDiffGpu(int64_t n, T scale, float l1, float l2, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, + T* model_diff_tmp) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + model_diff_tmp[i] = + CastScaleRegularizeGradientFunctor()(model_diff[i], model[i], scale, l1, l2); + } +} + +template +__global__ void LarsGetLocalLearningRateGpu(const float* learning_rate, T weight_decay, T epsilon, + T lars_coefficient, const int64_t* skip_if, + T* data_tmp) { + if (skip_if != nullptr && *skip_if != 0) { return; } + T* model_norm = &data_tmp[0]; + T* model_diff_norm = &data_tmp[1]; + T* local_learning_rate = &data_tmp[2]; + *model_norm = std::sqrt(*model_norm); + 
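+  // data_tmp holds [sum of squares of model, sum of squares of scaled model_diff, local lr];
+  // the square roots just above and below turn the first two into L2 norms, and the statements
+  // that follow form the LARS trust ratio
+  //   lars = lars_coefficient * ||w|| / (epsilon + ||g|| + weight_decay * ||w||)
+  // and scale *learning_rate by it.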
*model_diff_norm = std::sqrt(*model_diff_norm); + T lars = static_cast(1); + if (*model_norm > 0 && *model_diff_norm > 0) { + lars = lars_coefficient * (*model_norm) + / (epsilon + (*model_diff_norm) + weight_decay * (*model_norm)); + } + *local_learning_rate = *learning_rate * lars; +} + +template +__global__ void LarsUpdateGpu(int64_t n, float momentum_beta, T* momentum, float weight_decay, + const int64_t* skip_if, T* local_learning_rate, T* model_diff_tmp, + T* model) { + if (skip_if != nullptr && *skip_if != 0) { return; } + CUDA_1D_KERNEL_LOOP(i, n) { + LarsUpdateFunctor()(model_diff_tmp + i, model + i, momentum_beta, momentum + i, weight_decay, + *local_learning_rate); + } +} +} // namespace + +template +struct LarsUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, + float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* model, T* momentum, T* data_tmp, T* model_diff_tmp); +}; + +template +void LarsUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, + float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* momentum, T* data_tmp, + T* model_diff_tmp) { + LarsScaleModelDiffGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, scale_by_ptr, skip_if, model_diff, model, model_diff_tmp); + T* model_norm = data_tmp; + T* model_diff_norm = data_tmp + 1; + T* local_learning_rate = data_tmp + 2; + Memset(stream, data_tmp, 0, 2 * sizeof(T)); + SumSquares2<<As()->cuda_stream()>>>(n, model, model_norm, + model_diff_tmp, model_diff_norm); + LarsGetLocalLearningRateGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( + learning_rate, weight_decay, epsilon, lars_coefficient, skip_if, data_tmp); + LarsUpdateGpu<<As()->cuda_stream()>>>( + n, momentum_beta, momentum, weight_decay, skip_if, local_learning_rate, model_diff_tmp, + model); +} + +template +struct LarsUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, + float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float16* model_diff, T* model, T* momentum, T* data_tmp, + T* model_diff_tmp); +}; + +template +void LarsUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, + float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model, T* momentum, T* data_tmp, + T* model_diff_tmp) { + LarsUpdateKernelUtil::Update( + stream, n, scale, l1, l2, momentum_beta, epsilon, lars_coefficient, weight_decay, + learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, + momentum, data_tmp, model_diff_tmp); +} + +template struct LarsUpdateKernelUtil; +template struct LarsUpdateKernelUtil; +template struct LarsUpdateKernelUtil; + +template +__global__ void FtrlUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, + float lambda2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* model, T* accumulate, T* z) { + if (skip_if != nullptr && *skip_if != 0) { return; } 
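+  // NOTE: the per-element math is delegated to FtrlUpdateFunctor (model_update_kernel_util.h),
+  // assumed to implement the usual FTRL-Proximal rule: accumulate the squared gradient, update z
+  // with the lr_power-corrected step, and recover the model by soft-thresholding z with
+  // lambda1/lambda2/beta; the exact formula is defined in the functor, not here.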
+ if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + FtrlUpdateFunctor()(model_diff + i, model + i, accumulate + i, z + i, scale, l1, l2, + lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val); + } +} + +template +struct FtrlUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, + float lambda1, float lambda2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* accumulate, T* z); +}; + +template +void FtrlUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, + float lambda2, float beta, float weight_decay, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* model, T* accumulate, T* z) { + FtrlUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, + learning_rate, scale_by_ptr, skip_if, model_diff, model, accumulate, z); +} + +template +struct FtrlUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, + float lambda1, float lambda2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model, T* accumulate, + T* z); +}; + +template +void FtrlUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, + float lambda2, float beta, float weight_decay, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float16* model_diff, T* model, T* accumulate, T* z) { + FtrlUpdateKernelUtil::Update( + stream, n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, + learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, + accumulate, z); +} + +template struct FtrlUpdateKernelUtil; +template struct FtrlUpdateKernelUtil; +template struct FtrlUpdateKernelUtil; } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp b/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp index 4feb42b..6d65e1f 100644 --- a/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp +++ b/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp @@ -1,317 +1,317 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#include - -namespace oneflow { - -namespace { - -// NOTE(Liang Depeng): refer to -// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda -template -__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, - T* min_ptr) { - extern __shared__ unsigned char shared_max_min_memory[]; - T* shared_max = reinterpret_cast(shared_max_min_memory); - T* shared_min = shared_max + blockDim.x; - - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - shared_max[tid] = -FLT_MAX; - shared_min[tid] = -FLT_MAX; - - while (gid < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[gid]); - shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); - gid += gridDim.x * blockDim.x; - } - __syncthreads(); - gid = (blockDim.x * blockIdx.x) + tid; - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s && gid < elements) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); - } - __syncthreads(); - } - - if (tid == 0) { - cuda::atomic::Max(max_ptr, shared_max[0]); - cuda::atomic::Max(min_ptr, shared_min[0]); - } -} - -template -__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - max_ptr[gid] = -FLT_MAX; - min_ptr[gid] = -FLT_MAX; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointSymmetric(const int64_t elements, const double quantization_bit, - const float momentum, const T* max_ptr, const T* min_ptr, - T* moving_max_ptr, T* moving_min_ptr, T* scale, - T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T activation_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - - if (moving_max_ptr[gid] == 0) - moving_max_ptr[gid] = activation_max; - else - moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); - - // NOTE(Liang Depeng): symmetric quantization only use moving_max to calculate the scale - moving_min_ptr[gid] = moving_max_ptr[gid]; - - scale[gid] = moving_max_ptr[gid] / denominator; - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalFreezeScaleZeroPointSymmetric(const int64_t elements, - const double quantization_bit, - const float momentum, const T* moving_max_ptr, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = moving_max_ptr[gid] / denominator; - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointAffine(const int64_t elements, const double quantization_bit, - const float momentum, const T* max_ptr, const T* min_ptr, - T* moving_max_ptr, T* moving_min_ptr, T* scale, - T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit)) - 1; - - if (moving_max_ptr[gid] == 0) - moving_max_ptr[gid] = 
max_ptr[gid]; - else - moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + max_ptr[gid] * (1 - momentum); - - if (moving_min_ptr[gid] == 0) - moving_min_ptr[gid] = -min_ptr[gid]; - else - moving_min_ptr[gid] = moving_min_ptr[gid] * momentum + -min_ptr[gid] * (1 - momentum); - - T min = moving_min_ptr[gid]; - T s = (moving_max_ptr[gid] - min) / denominator; - - scale[gid] = s; - zero_point[gid] = -round(min / s); - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalFreezeScaleZeroPointAffine(const int64_t elements, const double quantization_bit, - const float momentum, const T* moving_max_ptr, - const T* moving_min_ptr, T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit)) - 1; - - T min = moving_min_ptr[gid]; - T s = (moving_max_ptr[gid] - min) / denominator; - - scale[gid] = s; - zero_point[gid] = -round(min / s); - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointCambricon(const int64_t elements, const double quantization_bit, - const float momentum, const T* max_ptr, const T* min_ptr, - T* moving_max_ptr, T* moving_min_ptr, T* scale, - T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T activation_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - - if (moving_max_ptr[gid] == 0) - moving_max_ptr[gid] = activation_max; - else - moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); - - // NOTE(Liang Depeng): cambricon quantization only use moving_max to calculate the scale - moving_min_ptr[gid] = moving_max_ptr[gid]; - - scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalFreezeScaleZeroPointCambricon(const int64_t elements, - const double quantization_bit, - const float momentum, const T* moving_max_ptr, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, - size_t shared_mem_size) { - ep::CudaLaunchConfig config; - stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); - config.shared_mem_size = shared_mem_size; - return config; -} - -} // namespace - -#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) 
\ - (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); - -template -class GpuMovingAverageMinMaxObserverKernel final : public user_op::OpKernel { - public: - GpuMovingAverageMinMaxObserverKernel() = default; - ~GpuMovingAverageMinMaxObserverKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* current_train_step = - ctx->Tensor4ArgNameAndIndex("current_train_step", 0); - user_op::Tensor* moving_max = ctx->Tensor4ArgNameAndIndex("moving_max", 0); - user_op::Tensor* moving_min = ctx->Tensor4ArgNameAndIndex("moving_min", 0); - user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const bool is_training = ctx->Attr("training"); - const int64_t stop_update_after_iters = ctx->Attr("stop_update_after_iters"); - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const float momentum = ctx->Attr("momentum"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - int64_t elements = in->shape_view().elem_cnt(); - T* max_ptr = tmp_buffer->mut_dptr(); - T* min_ptr = max_ptr + 1; - - int64_t* host_current_train_step_ptr = new int64_t[current_train_step->shape_view().elem_cnt()]; - OF_CUDA_CHECK(hipMemcpy(host_current_train_step_ptr, current_train_step->dptr(), - current_train_step->shape_view().elem_cnt() * sizeof(int64_t), - hipMemcpyDefault)); - auto* cuda_stream = ctx->stream()->As(); - if (*host_current_train_step_ptr <= stop_update_after_iters && is_training) { - LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, 1, 0, 1, max_ptr, min_ptr); - LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, - kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, - min_ptr); - } - bool moving = (*host_current_train_step_ptr <= stop_update_after_iters) && is_training; - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - if (moving) { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, max_ptr, min_ptr, - moving_max->mut_dptr(), moving_min->mut_dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } else { - LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, moving_max->dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } - } else { // quantization_scheme == "affine" - if (moving) { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, max_ptr, min_ptr, - moving_max->mut_dptr(), moving_min->mut_dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } else { - LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointAffine), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, moving_max->dptr(), - moving_min->dptr(), scale->mut_dptr(), - zero_point->mut_dptr()); - } - } - } else if (quantization_formula == "cambricon") { - if (moving) { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, max_ptr, min_ptr, - moving_max->mut_dptr(), moving_min->mut_dptr(), - scale->mut_dptr(), 
zero_point->mut_dptr()); - } else { - LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointCambricon), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, moving_max->dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } - } else { - UNIMPLEMENTED(); - } - - delete[] host_current_train_step_ptr; - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(dtype) \ - REGISTER_USER_KERNEL("moving_average_min_max_observer") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 2 * sizeof(dtype); }) - -REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(float); -REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#include + +namespace oneflow { + +namespace { + +// NOTE(Liang Depeng): refer to +// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda +template +__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, + T* min_ptr) { + extern __shared__ unsigned char shared_max_min_memory[]; + T* shared_max = reinterpret_cast(shared_max_min_memory); + T* shared_min = shared_max + blockDim.x; + + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + shared_max[tid] = -FLT_MAX; + shared_min[tid] = -FLT_MAX; + + while (gid < elements) { + shared_max[tid] = max(shared_max[tid], input_ptr[gid]); + shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); + gid += gridDim.x * blockDim.x; + } + __syncthreads(); + gid = (blockDim.x * blockIdx.x) + tid; + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && gid < elements) { + shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + } + __syncthreads(); + } + + if (tid == 0) { + cuda::atomic::Max(max_ptr, shared_max[0]); + cuda::atomic::Max(min_ptr, shared_min[0]); + } +} + +template +__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + max_ptr[gid] = -FLT_MAX; + min_ptr[gid] = -FLT_MAX; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointSymmetric(const int64_t elements, const double quantization_bit, + const float momentum, const T* max_ptr, const T* min_ptr, + T* moving_max_ptr, T* moving_min_ptr, T* scale, + T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T activation_max = 
max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + + if (moving_max_ptr[gid] == 0) + moving_max_ptr[gid] = activation_max; + else + moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); + + // NOTE(Liang Depeng): symmetric quantization only use moving_max to calculate the scale + moving_min_ptr[gid] = moving_max_ptr[gid]; + + scale[gid] = moving_max_ptr[gid] / denominator; + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalFreezeScaleZeroPointSymmetric(const int64_t elements, + const double quantization_bit, + const float momentum, const T* moving_max_ptr, + T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = moving_max_ptr[gid] / denominator; + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointAffine(const int64_t elements, const double quantization_bit, + const float momentum, const T* max_ptr, const T* min_ptr, + T* moving_max_ptr, T* moving_min_ptr, T* scale, + T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit)) - 1; + + if (moving_max_ptr[gid] == 0) + moving_max_ptr[gid] = max_ptr[gid]; + else + moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + max_ptr[gid] * (1 - momentum); + + if (moving_min_ptr[gid] == 0) + moving_min_ptr[gid] = -min_ptr[gid]; + else + moving_min_ptr[gid] = moving_min_ptr[gid] * momentum + -min_ptr[gid] * (1 - momentum); + + T min = moving_min_ptr[gid]; + T s = (moving_max_ptr[gid] - min) / denominator; + + scale[gid] = s; + zero_point[gid] = -round(min / s); + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalFreezeScaleZeroPointAffine(const int64_t elements, const double quantization_bit, + const float momentum, const T* moving_max_ptr, + const T* moving_min_ptr, T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit)) - 1; + + T min = moving_min_ptr[gid]; + T s = (moving_max_ptr[gid] - min) / denominator; + + scale[gid] = s; + zero_point[gid] = -round(min / s); + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointCambricon(const int64_t elements, const double quantization_bit, + const float momentum, const T* max_ptr, const T* min_ptr, + T* moving_max_ptr, T* moving_min_ptr, T* scale, + T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T activation_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + + if (moving_max_ptr[gid] == 0) + moving_max_ptr[gid] = activation_max; + else + moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); + + // NOTE(Liang Depeng): cambricon quantization only use moving_max to calculate the scale + moving_min_ptr[gid] = moving_max_ptr[gid]; + + scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalFreezeScaleZeroPointCambricon(const int64_t elements, + const double quantization_bit, + const float momentum, const T* moving_max_ptr, + T* scale, 
T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, + size_t shared_mem_size) { + ep::CudaLaunchConfig config; + stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); + config.shared_mem_size = shared_mem_size; + return config; +} + +} // namespace + +#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) \ + (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); + +template +class GpuMovingAverageMinMaxObserverKernel final : public user_op::OpKernel { + public: + GpuMovingAverageMinMaxObserverKernel() = default; + ~GpuMovingAverageMinMaxObserverKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* current_train_step = + ctx->Tensor4ArgNameAndIndex("current_train_step", 0); + user_op::Tensor* moving_max = ctx->Tensor4ArgNameAndIndex("moving_max", 0); + user_op::Tensor* moving_min = ctx->Tensor4ArgNameAndIndex("moving_min", 0); + user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const bool is_training = ctx->Attr("training"); + const int64_t stop_update_after_iters = ctx->Attr("stop_update_after_iters"); + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const float momentum = ctx->Attr("momentum"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + int64_t elements = in->shape_view().elem_cnt(); + T* max_ptr = tmp_buffer->mut_dptr(); + T* min_ptr = max_ptr + 1; + + int64_t* host_current_train_step_ptr = new int64_t[current_train_step->shape_view().elem_cnt()]; + OF_CUDA_CHECK(hipMemcpy(host_current_train_step_ptr, current_train_step->dptr(), + current_train_step->shape_view().elem_cnt() * sizeof(int64_t), + hipMemcpyDefault)); + auto* cuda_stream = ctx->stream()->As(); + if (*host_current_train_step_ptr <= stop_update_after_iters && is_training) { + LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, 1, 0, 1, max_ptr, min_ptr); + LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, + kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, + min_ptr); + } + bool moving = (*host_current_train_step_ptr <= stop_update_after_iters) && is_training; + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + if (moving) { + LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, max_ptr, min_ptr, + moving_max->mut_dptr(), moving_min->mut_dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { + LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, moving_max->dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } + } else { // quantization_scheme == "affine" + if (moving) { + 
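+        // Affine scheme: CalScaleZeroPointAffine (defined above) updates moving_max/moving_min
+        // with the same momentum EMA and derives
+        //   scale = (moving_max - moving_min) / (2^quantization_bit - 1)
+        //   zero_point = -round(moving_min / scale)
+        // the frozen branch below reuses the stored moving statistics without updating them.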
LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, max_ptr, min_ptr, + moving_max->mut_dptr(), moving_min->mut_dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { + LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointAffine), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, moving_max->dptr(), + moving_min->dptr(), scale->mut_dptr(), + zero_point->mut_dptr()); + } + } + } else if (quantization_formula == "cambricon") { + if (moving) { + LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, max_ptr, min_ptr, + moving_max->mut_dptr(), moving_min->mut_dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { + LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointCambricon), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, moving_max->dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } + } else { + UNIMPLEMENTED(); + } + + delete[] host_current_train_step_ptr; + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(dtype) \ + REGISTER_USER_KERNEL("moving_average_min_max_observer") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 2 * sizeof(dtype); }) + +REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(float); +REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/multi_reduce_kernels.hip.cpp b/oneflow/user/kernels/multi_reduce_kernels.hip.cpp index 85440cf..4b33383 100644 --- a/oneflow/user/kernels/multi_reduce_kernels.hip.cpp +++ b/oneflow/user/kernels/multi_reduce_kernels.hip.cpp @@ -1,142 +1,142 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/multi_reduce_kernels.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/device/cuda_util.h" -#include -#include - -namespace oneflow { - -namespace { - -constexpr int64_t kMultiReduceMaxPackSize = 64; - -template -struct MultiReduceParamsPack { - MultiReduceParam params[kMultiReduceMaxPackSize]; - size_t size; -}; - -template -__global__ void MultiBlockReduceGpu(TransformFn transform, - const MultiReduceParamsPack pack_params, const T init, - T* out) { - ReduceFn reduce_fn{}; - T t_out = init; - for (int i = 0; i < pack_params.size; ++i) { - const auto& param = pack_params.params[i]; - CUDA_1D_KERNEL_LOOP(j, param.size) { t_out = reduce_fn(t_out, transform(param.data[j])); } - } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_out = BlockReduce(temp_storage).Reduce(t_out, reduce_fn); - if (threadIdx.x == 0) { out[blockIdx.x] = b_out; } -} - -size_t InferTempStorageSize(user_op::InferContext* ctx) { - auto input_size = ctx->input_size("x"); - if (input_size == 0) { return 0; } - int64_t max_elem_cnt = 0; - int64_t pack_size = 0; - int32_t num_blocks = 0; - for (size_t i = 0; i < input_size; ++i) { - int64_t elem_cnt = ctx->InputShape("x", i).elem_cnt(); - max_elem_cnt = std::max(max_elem_cnt, elem_cnt); - pack_size++; - if (pack_size == kMultiReduceMaxPackSize || i == input_size - 1) { - CHECK_LT(max_elem_cnt, std::numeric_limits::max()); - num_blocks += BlocksNum4ThreadsNum(static_cast(max_elem_cnt)); - max_elem_cnt = 0; - pack_size = 0; - } - } - CHECK_LT(num_blocks, kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock) - << "Too much blocks needed for computing " << ctx->op_name() << ", should be less than " - << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock - << ", but got " << num_blocks; - size_t elem_size = GetSizeOfDataType(ctx->InputDType("x", 0)); - return GetCudaAlignedSize(num_blocks * elem_size * 2); -} - -} // namespace - -template -struct MultiReduce { - void operator()(ep::Stream* stream, TransformFn transform, - const std::vector>& params, T init, T* ret, T* temp) { - CHECK_NOTNULL(temp); - int32_t total_num_blocks = 0; - for (size_t i = 0; i < params.size(); i += kMultiReduceMaxPackSize) { - MultiReduceParamsPack pack_params{}; - size_t max_elem_cnt = 0; - pack_params.size = std::min(kMultiReduceMaxPackSize, params.size() - i); - for (size_t j = 0; j < pack_params.size; ++j) { - pack_params.params[j] = params[i + j]; - max_elem_cnt = std::max(max_elem_cnt, pack_params.params[j].size); - } - int32_t num_blocks = BlocksNum4ThreadsNum(max_elem_cnt); - MultiBlockReduceGpu - <<As()->cuda_stream()>>>( - transform, pack_params, init, temp + total_num_blocks); - total_num_blocks += num_blocks; - } - size_t wksp_size = 0; - auto DeviceReduce = [&](void* temp_storage) -> void { - OF_CUDA_CHECK(hipcub::DeviceReduce::Reduce(temp_storage, wksp_size, temp, ret, total_num_blocks, - ReduceFn{}, init, - stream->As()->cuda_stream())); - }; - DeviceReduce(nullptr); - // NOTE(zwx): We have allocated the temp storage with the space - // that can hold all the elements to reduce, - // normally the `temp_storage_bytes` for hipcub::DeviceReduce shouldn't exceed it. 
- CHECK_LE(wksp_size, total_num_blocks * sizeof(T)) - << wksp_size << " size in bytes of temp storage is needed for doing hipcub::DeviceReduce, " - << "but only allocated " << total_num_blocks * sizeof(T); - DeviceReduce(temp + total_num_blocks); - } -}; - -#define REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("multi_reduce_sum_pow_abs") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTempStorageSize); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL(op_type_name, ximum_enum, dtype) \ - REGISTER_USER_KERNEL(op_type_name) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTempStorageSize); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_min_abs", Ximum::kMin, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_min_abs", Ximum::kMin, dtype) - -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(float) -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(double) - -REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(float) -REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/multi_reduce_kernels.h" +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/device/cuda_util.h" +#include +#include + +namespace oneflow { + +namespace { + +constexpr int64_t kMultiReduceMaxPackSize = 64; + +template +struct MultiReduceParamsPack { + MultiReduceParam params[kMultiReduceMaxPackSize]; + size_t size; +}; + +template +__global__ void MultiBlockReduceGpu(TransformFn transform, + const MultiReduceParamsPack pack_params, const T init, + T* out) { + ReduceFn reduce_fn{}; + T t_out = init; + for (int i = 0; i < pack_params.size; ++i) { + const auto& param = pack_params.params[i]; + CUDA_1D_KERNEL_LOOP(j, param.size) { t_out = reduce_fn(t_out, transform(param.data[j])); } + } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_out = BlockReduce(temp_storage).Reduce(t_out, reduce_fn); + if (threadIdx.x == 0) { out[blockIdx.x] = b_out; } +} + +size_t InferTempStorageSize(user_op::InferContext* ctx) { + auto input_size = ctx->input_size("x"); + if (input_size == 0) { return 0; } + int64_t max_elem_cnt = 0; + int64_t pack_size = 0; + int32_t num_blocks = 0; + for (size_t i = 0; i < input_size; ++i) { + int64_t elem_cnt = ctx->InputShape("x", i).elem_cnt(); + max_elem_cnt = std::max(max_elem_cnt, elem_cnt); + pack_size++; + if (pack_size == kMultiReduceMaxPackSize || i == input_size - 1) { + CHECK_LT(max_elem_cnt, std::numeric_limits::max()); + num_blocks += BlocksNum4ThreadsNum(static_cast(max_elem_cnt)); + max_elem_cnt = 0; + pack_size = 0; + } + } + CHECK_LT(num_blocks, kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock) + << "Too much blocks needed for computing " << ctx->op_name() << ", should be less than " + << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock + << ", but got " << num_blocks; + size_t elem_size = GetSizeOfDataType(ctx->InputDType("x", 0)); + return GetCudaAlignedSize(num_blocks * elem_size * 2); +} + +} // namespace + +template +struct MultiReduce { + void operator()(ep::Stream* stream, TransformFn transform, + const std::vector>& params, T init, T* ret, T* temp) { + CHECK_NOTNULL(temp); + int32_t total_num_blocks = 0; + for (size_t i = 0; i < params.size(); i += kMultiReduceMaxPackSize) { + MultiReduceParamsPack pack_params{}; + size_t max_elem_cnt = 0; + pack_params.size = std::min(kMultiReduceMaxPackSize, params.size() - i); + for (size_t j = 0; j < pack_params.size; ++j) { + pack_params.params[j] = params[i + j]; + max_elem_cnt = std::max(max_elem_cnt, pack_params.params[j].size); + } + int32_t num_blocks = BlocksNum4ThreadsNum(max_elem_cnt); + MultiBlockReduceGpu + <<As()->cuda_stream()>>>( + transform, pack_params, init, temp + total_num_blocks); + total_num_blocks += num_blocks; + } + size_t wksp_size = 0; + auto DeviceReduce = [&](void* temp_storage) -> void { + OF_CUDA_CHECK(hipcub::DeviceReduce::Reduce(temp_storage, wksp_size, temp, ret, total_num_blocks, + ReduceFn{}, init, + stream->As()->cuda_stream())); + }; + DeviceReduce(nullptr); + // NOTE(zwx): We have allocated the temp storage with the space + // that can hold all the elements to reduce, + // normally the `temp_storage_bytes` for hipcub::DeviceReduce shouldn't exceed it. 
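+    // InferTempStorageSize sizes `temp` at 2 * num_blocks * elem_size: the first total_num_blocks
+    // elements hold the per-block partials written by MultiBlockReduceGpu above, and the second
+    // half (temp + total_num_blocks) is reused below as hipcub::DeviceReduce's workspace.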
+ CHECK_LE(wksp_size, total_num_blocks * sizeof(T)) + << wksp_size << " size in bytes of temp storage is needed for doing hipcub::DeviceReduce, " + << "but only allocated " << total_num_blocks * sizeof(T); + DeviceReduce(temp + total_num_blocks); + } +}; + +#define REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("multi_reduce_sum_pow_abs") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTempStorageSize); + +#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL(op_type_name, ximum_enum, dtype) \ + REGISTER_USER_KERNEL(op_type_name) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTempStorageSize); + +#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_max_abs", Ximum::kMax, dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_min_abs", Ximum::kMin, dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_max_abs", Ximum::kMax, dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_min_abs", Ximum::kMin, dtype) + +REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(float) +REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(double) + +REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(float) +REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp b/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp index 5f0cea1..2a974e1 100644 --- a/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp +++ b/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp @@ -1,166 +1,166 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/nd_index_slice_kernels.h" -#include "oneflow/core/hip/atomic.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void CudaGatherNd(NdIndexSliceArgs args, const I* indices, const T* dense, - T* slices) { - DoGatherNd(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, args.dense_shape, - indices, dense, slices); -} - -template -__global__ void CudaScatterNdAdd(NdIndexSliceArgs args, const I* indices, const T* slices, - T* dense) { - DoScatterNdAdd(args.num_slices * args.slice_size, args.slice_size, - args.index_ndims, args.dense_shape, indices, slices, dense); -} - -template -__global__ void CudaScatterNdUpdate(NdIndexSliceArgs args, const I* indices, const T* slices, - T* dense) { - DoScatterNdUpdate(args.num_slices * args.slice_size, args.slice_size, - args.index_ndims, args.dense_shape, indices, slices, dense); -} - -template -__global__ void CudaFillByNdIndex(NdIndexSliceArgs args, const I* indices, T* dense, - T value) { - DoFillByNdIndex(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, - args.dense_shape, indices, dense, value); -} - -} // namespace - -template -struct GatherNdFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - const T* dense, T* slices) const { - RUN_CUDA_KERNEL((CudaGatherNd), stream, args.num_slices * args.slice_size, args, indices, - dense, slices); - } -}; - -template -struct ScatterNdAddFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - const T* slices, T* dense) const { - RUN_CUDA_KERNEL((CudaScatterNdAdd), stream, args.num_slices * args.slice_size, args, - indices, slices, dense); - } -}; - -template -struct ScatterNdUpdateFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - const T* slices, T* dense) const { - RUN_CUDA_KERNEL((CudaScatterNdUpdate), stream, args.num_slices * args.slice_size, args, - indices, slices, dense); - } -}; - -template -struct FillByNdIndexFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - T* dense, T value) const { - RUN_CUDA_KERNEL((CudaFillByNdIndex), stream, args.num_slices * args.slice_size, args, - indices, dense, value); - } -}; - -template -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const T* x, T* y) { cuda::atomic::Add(y, *x); } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const bool* x, bool* y) { *y += *x; } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const uint8_t* x, uint8_t* y) { *y += *x; } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const int8_t* x, int8_t* y) { *y += *x; } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const int64_t* x, int64_t* y) { *y += *x; } -}; - -#define CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ \ - FLOATING_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - INSTANTIATE_GATHER_ND_FUNCTOR, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SCATTER_ND_ADD_FUNCTOR, (DeviceType::kCUDA), - CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - 
-OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_FILL_BY_ND_INDEX_FUNCTOR, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - REGISTER_GATHER_ND_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - REGISTER_SCATTER_ND_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SCATTER_ND_LIKE_KERNELS, (DeviceType::kCUDA), - CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - REGISTER_TENSOR_GATHER_ND_UPDATE_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_TENSOR_GATHER_ND_ADD_KERNELS, (DeviceType::kCUDA), - CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000) || defined(__HIP_DEVICE_COMPILE__) - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const float16* x, float16* y) { - cuda::atomic::Add(reinterpret_cast(y), *(reinterpret_cast(x))); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ND_INDEX_SLICE_FUNCTORS, (DeviceType::kCUDA), - FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ND_INDEX_SLICE_KERNELS, (DeviceType::kCUDA), - FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -#endif - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/nd_index_slice_kernels.h" +#include "oneflow/core/hip/atomic.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void CudaGatherNd(NdIndexSliceArgs args, const I* indices, const T* dense, + T* slices) { + DoGatherNd(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, args.dense_shape, + indices, dense, slices); +} + +template +__global__ void CudaScatterNdAdd(NdIndexSliceArgs args, const I* indices, const T* slices, + T* dense) { + DoScatterNdAdd(args.num_slices * args.slice_size, args.slice_size, + args.index_ndims, args.dense_shape, indices, slices, dense); +} + +template +__global__ void CudaScatterNdUpdate(NdIndexSliceArgs args, const I* indices, const T* slices, + T* dense) { + DoScatterNdUpdate(args.num_slices * args.slice_size, args.slice_size, + args.index_ndims, args.dense_shape, indices, slices, dense); +} + +template +__global__ void CudaFillByNdIndex(NdIndexSliceArgs args, const I* indices, T* dense, + T value) { + DoFillByNdIndex(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, + args.dense_shape, indices, dense, value); +} + +} // namespace + +template +struct GatherNdFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + const T* dense, T* slices) const { + RUN_CUDA_KERNEL((CudaGatherNd), stream, args.num_slices * args.slice_size, args, indices, + dense, slices); + } +}; + +template +struct ScatterNdAddFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + const T* slices, T* dense) const { + RUN_CUDA_KERNEL((CudaScatterNdAdd), stream, args.num_slices * args.slice_size, args, + indices, slices, dense); + } +}; + +template +struct ScatterNdUpdateFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + const T* slices, T* dense) const { + RUN_CUDA_KERNEL((CudaScatterNdUpdate), stream, args.num_slices * args.slice_size, args, + indices, slices, dense); + } +}; + +template +struct FillByNdIndexFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + T* dense, T value) const { + RUN_CUDA_KERNEL((CudaFillByNdIndex), stream, args.num_slices * args.slice_size, args, + indices, dense, value); + } +}; + +template +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const T* x, T* y) { cuda::atomic::Add(y, *x); } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const bool* x, bool* y) { *y += *x; } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const uint8_t* x, uint8_t* y) { *y += *x; } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const int8_t* x, int8_t* y) { *y += *x; } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const int64_t* x, int64_t* y) { *y += *x; } +}; + +#define CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ \ + FLOATING_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + INSTANTIATE_GATHER_ND_FUNCTOR, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SCATTER_ND_ADD_FUNCTOR, (DeviceType::kCUDA), + CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + 
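+// NOTE: CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ above is the floating types plus int32, which go
+// through cuda::atomic::Add; the bool instantiation relies on the plain (non-atomic) DeviceAdd
+// specialization defined earlier.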
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_FILL_BY_ND_INDEX_FUNCTOR, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + REGISTER_GATHER_ND_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + REGISTER_SCATTER_ND_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SCATTER_ND_LIKE_KERNELS, (DeviceType::kCUDA), + CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + REGISTER_TENSOR_GATHER_ND_UPDATE_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_TENSOR_GATHER_ND_ADD_KERNELS, (DeviceType::kCUDA), + CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000) || defined(__HIP_DEVICE_COMPILE__) + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const float16* x, float16* y) { + cuda::atomic::Add(reinterpret_cast(y), *(reinterpret_cast(x))); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ND_INDEX_SLICE_FUNCTORS, (DeviceType::kCUDA), + FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ND_INDEX_SLICE_KERNELS, (DeviceType::kCUDA), + FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +#endif + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/nll_kernel_util.hip.cpp b/oneflow/user/kernels/nll_kernel_util.hip.cpp index 90c82b7..52c68e5 100644 --- a/oneflow/user/kernels/nll_kernel_util.hip.cpp +++ b/oneflow/user/kernels/nll_kernel_util.hip.cpp @@ -1,93 +1,93 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/user/kernels/nll_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "hip/hip_runtime.h" - -namespace oneflow { - -namespace { - -template -__global__ void NLLForward(const int32_t num_samples, const K num_classes, const K class_start, - const K ignore_index, const T* input, const K* target, const T* weight, - T* out, T* out_weight) { - const T zero = GetZeroVal(); - const T one = GetOneVal(); - CUDA_1D_KERNEL_LOOP(i, num_samples) { - K label = target[i]; - T w = zero; - T y = zero; - if (label != ignore_index) { - label -= class_start; - if (label >= 0 && label < num_classes) { - w = weight ? 
weight[label] : one; - y = -(input[i * num_classes + label] * w); - } - } - out[i] = y; - out_weight[i] = w; - } -} - -template -__global__ void NLLBackward(const int32_t num_samples, const K num_classes, const K class_start, - const K ignore_index, const T* out_grad, const K* target, - const T* weight, T* in_grad) { - const T one = GetOneVal(); - const T zero = GetZeroVal(); - CUDA_1D_KERNEL_LOOP_T(K, i, num_samples * num_classes) { - const K n = i / num_classes; - const K idx = i - n * num_classes; - const K label = target[n]; - if (label != ignore_index && idx == label - class_start) { - in_grad[i] = out_grad[n] * (weight ? -weight[idx] : -one); - } else { - in_grad[i] = zero; - } - } -} - -} // namespace - -template -struct NLLKernelUtil { - static void Forward(ep::Stream* stream, const int32_t num_samples, const K num_classes, - const K class_start, const K ignore_index, const T* input, const K* target, - const T* weight, T* out, T* out_weight) { - NLLForward<<As()->cuda_stream()>>>(num_samples, num_classes, - class_start, ignore_index, input, - target, weight, out, out_weight); - } - - static void Backward(ep::Stream* stream, const int32_t num_samples, const K num_classes, - const K class_start, const K ignore_index, const T* out_grad, - const K* target, const T* weight, T* in_grad) { - NLLBackward<<As()->cuda_stream()>>>( - num_samples, num_classes, class_start, ignore_index, out_grad, target, weight, in_grad); - } -}; - -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/nll_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "hip/hip_runtime.h" + +namespace oneflow { + +namespace { + +template +__global__ void NLLForward(const int32_t num_samples, const K num_classes, const K class_start, + const K ignore_index, const T* input, const K* target, const T* weight, + T* out, T* out_weight) { + const T zero = GetZeroVal(); + const T one = GetOneVal(); + CUDA_1D_KERNEL_LOOP(i, num_samples) { + K label = target[i]; + T w = zero; + T y = zero; + if (label != ignore_index) { + label -= class_start; + if (label >= 0 && label < num_classes) { + w = weight ? weight[label] : one; + y = -(input[i * num_classes + label] * w); + } + } + out[i] = y; + out_weight[i] = w; + } +} + +template +__global__ void NLLBackward(const int32_t num_samples, const K num_classes, const K class_start, + const K ignore_index, const T* out_grad, const K* target, + const T* weight, T* in_grad) { + const T one = GetOneVal(); + const T zero = GetZeroVal(); + CUDA_1D_KERNEL_LOOP_T(K, i, num_samples * num_classes) { + const K n = i / num_classes; + const K idx = i - n * num_classes; + const K label = target[n]; + if (label != ignore_index && idx == label - class_start) { + in_grad[i] = out_grad[n] * (weight ? 
-weight[idx] : -one); + } else { + in_grad[i] = zero; + } + } +} + +} // namespace + +template +struct NLLKernelUtil { + static void Forward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* input, const K* target, + const T* weight, T* out, T* out_weight) { + NLLForward<<As()->cuda_stream()>>>(num_samples, num_classes, + class_start, ignore_index, input, + target, weight, out, out_weight); + } + + static void Backward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* out_grad, + const K* target, const T* weight, T* in_grad) { + NLLBackward<<As()->cuda_stream()>>>( + num_samples, num_classes, class_start, ignore_index, out_grad, target, weight, in_grad); + } +}; + +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/nms_kernel.hip.cpp b/oneflow/user/kernels/nms_kernel.hip.cpp index 0be72a5..bc8a3eb 100644 --- a/oneflow/user/kernels/nms_kernel.hip.cpp +++ b/oneflow/user/kernels/nms_kernel.hip.cpp @@ -1,145 +1,145 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = sizeof(int64_t) * 8; - -template -__host__ __device__ __forceinline__ T CeilDiv(T a, T b) { - return (a + b - 1) / b; -} - -template -__host__ __device__ __forceinline__ T IoU(T const* const a, T const* const b) { - T interS = - max(min(a[2], b[2]) - max(a[0], b[0]), 0.f) * max(min(a[3], b[3]) - max(a[1], b[1]), 0.f); - T Sa = (a[2] - a[0]) * (a[3] - a[1]); - T Sb = (b[2] - b[0]) * (b[3] - b[1]); - return interS / (Sa + Sb - interS); -} - -template -__global__ void CalcSuppressionBitmaskMatrix(int num_boxes, float iou_threshold, const T* boxes, - int64_t* suppression_bmask_matrix) { - const int row = blockIdx.y; - const int col = blockIdx.x; - - if (row > col) return; - - const int row_size = min(num_boxes - row * kBlockSize, kBlockSize); - const int col_size = min(num_boxes - col * kBlockSize, kBlockSize); - - __shared__ T block_boxes[kBlockSize * 4]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 4 + 0] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 0]; - block_boxes[threadIdx.x * 4 + 1] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 1]; - block_boxes[threadIdx.x * 4 + 2] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 2]; - block_boxes[threadIdx.x * 4 + 3] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 3]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = kBlockSize * row + threadIdx.x; - const T* cur_box_ptr = boxes + cur_box_idx * 4; - unsigned long long bits = 0; - int start = 0; - if (row == col) { start = threadIdx.x + 1; } - for (int i = start; i < col_size; i++) { - if (IoU(cur_box_ptr, block_boxes + i * 4) > iou_threshold) { bits |= 1Ull << i; } - } - suppression_bmask_matrix[cur_box_idx * gridDim.y + col] = bits; - } -} - -__global__ void ScanSuppression(int num_boxes, int num_blocks, int num_keep, - int64_t* suppression_bmask, int8_t* keep_mask) { - extern __shared__ int64_t remv[]; - remv[threadIdx.x] = 0; - for (int i = 0; i < num_boxes; ++i) { - int block_n = i / kBlockSize; - int block_i = i % kBlockSize; - if (!(remv[block_n] & (1Ull << block_i))) { - remv[threadIdx.x] |= suppression_bmask[i * num_blocks + threadIdx.x]; - if (threadIdx.x == block_n && num_keep > 0) { - keep_mask[i] = 1; - num_keep -= 1; - } - } - } -} - -} // namespace - -template -class NmsGpuKernel final : public user_op::OpKernel { - public: - NmsGpuKernel() = default; - ~NmsGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* boxes_blob = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* keep_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const T* boxes = boxes_blob->dptr(); - int8_t* keep = keep_blob->mut_dptr(); - int64_t* suppression_mask = tmp_blob->mut_dptr(); - - const int num_boxes = boxes_blob->shape_view().At(0); - int num_keep = ctx->Attr("keep_n"); - if (num_keep <= 0 || num_keep > num_boxes) { num_keep = num_boxes; } - const int num_blocks = CeilDiv(num_boxes, kBlockSize); - Memset(ctx->stream(), suppression_mask, 0, - num_boxes * num_blocks * sizeof(int64_t)); - Memset(ctx->stream(), keep, 0, num_boxes * sizeof(int8_t)); - - dim3 blocks(num_blocks, num_blocks); - dim3 threads(kBlockSize); - 
CalcSuppressionBitmaskMatrix<<stream()->As()->cuda_stream()>>>( - num_boxes, ctx->Attr("iou_threshold"), boxes, suppression_mask); - ScanSuppression<<<1, num_blocks, num_blocks * sizeof(int64_t), - ctx->stream()->As()->cuda_stream()>>>( - num_boxes, num_blocks, num_keep, suppression_mask, keep); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_NMS_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("nms") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == DataType::kInt8) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0); \ - int64_t num_boxes = in_shape->At(0); \ - int64_t blocks = CeilDiv(num_boxes, kBlockSize); \ - return num_boxes * blocks * sizeof(int64_t); \ - }); - -REGISTER_NMS_CUDA_KERNEL(float) -REGISTER_NMS_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = sizeof(int64_t) * 8; + +template +__host__ __device__ __forceinline__ T CeilDiv(T a, T b) { + return (a + b - 1) / b; +} + +template +__host__ __device__ __forceinline__ T IoU(T const* const a, T const* const b) { + T interS = + max(min(a[2], b[2]) - max(a[0], b[0]), 0.f) * max(min(a[3], b[3]) - max(a[1], b[1]), 0.f); + T Sa = (a[2] - a[0]) * (a[3] - a[1]); + T Sb = (b[2] - b[0]) * (b[3] - b[1]); + return interS / (Sa + Sb - interS); +} + +template +__global__ void CalcSuppressionBitmaskMatrix(int num_boxes, float iou_threshold, const T* boxes, + int64_t* suppression_bmask_matrix) { + const int row = blockIdx.y; + const int col = blockIdx.x; + + if (row > col) return; + + const int row_size = min(num_boxes - row * kBlockSize, kBlockSize); + const int col_size = min(num_boxes - col * kBlockSize, kBlockSize); + + __shared__ T block_boxes[kBlockSize * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kBlockSize * row + threadIdx.x; + const T* cur_box_ptr = boxes + cur_box_idx * 4; + unsigned long long bits = 0; + int start = 0; + if (row == col) { start = threadIdx.x + 1; } + for (int i = start; i < col_size; i++) { + if (IoU(cur_box_ptr, block_boxes + i * 4) > iou_threshold) { bits |= 1Ull << i; } + } + suppression_bmask_matrix[cur_box_idx * gridDim.y + col] = bits; + } +} + 
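+
+// For intuition, the two NMS kernels above split a greedy non-maximum suppression into
+// (1) a pairwise IoU pass that packs "column box j suppresses row box i" decisions into
+// 64-bit rows, one bit per box in a 64-wide column block, and (2) a single-block scan
+// that ORs the rows of surviving boxes and emits int8 keep flags, capped at keep_n.
+// A sequential CPU sketch of the same greedy scheme follows, assuming boxes are stored
+// as [x1, y1, x2, y2] and already sorted by score; IoURef/NmsRef are illustrative names,
+// not part of OneFlow.
+//
+// #include <algorithm>
+// #include <cstdint>
+// #include <cstdio>
+// #include <vector>
+//
+// static float IoURef(const float* a, const float* b) {
+//   const float inter = std::max(std::min(a[2], b[2]) - std::max(a[0], b[0]), 0.f)
+//                     * std::max(std::min(a[3], b[3]) - std::max(a[1], b[1]), 0.f);
+//   const float sa = (a[2] - a[0]) * (a[3] - a[1]);
+//   const float sb = (b[2] - b[0]) * (b[3] - b[1]);
+//   return inter / (sa + sb - inter);
+// }
+//
+// // Greedy NMS: walk boxes in score order, keep a box unless an earlier kept box
+// // overlaps it above the threshold, and stop once keep_n boxes have been kept.
+// static std::vector<int8_t> NmsRef(const std::vector<float>& boxes, float iou_threshold,
+//                                   int keep_n) {
+//   const int n = static_cast<int>(boxes.size() / 4);
+//   if (keep_n <= 0 || keep_n > n) { keep_n = n; }
+//   std::vector<int8_t> keep(n, 0);
+//   std::vector<uint8_t> suppressed(n, 0);
+//   for (int i = 0; i < n && keep_n > 0; ++i) {
+//     if (suppressed[i]) { continue; }
+//     keep[i] = 1;
+//     keep_n -= 1;
+//     for (int j = i + 1; j < n; ++j) {
+//       if (!suppressed[j] && IoURef(&boxes[i * 4], &boxes[j * 4]) > iou_threshold) {
+//         suppressed[j] = 1;
+//       }
+//     }
+//   }
+//   return keep;
+// }
+//
+// int main() {
+//   // Two heavily overlapping boxes followed by a disjoint one; expected keep = 1 0 1.
+//   std::vector<float> boxes = {0, 0, 10, 10, 1, 1, 10, 10, 20, 20, 30, 30};
+//   for (int8_t k : NmsRef(boxes, 0.7f, -1)) { std::printf("%d ", static_cast<int>(k)); }
+//   std::printf("\n");
+//   return 0;
+// }
+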
+__global__ void ScanSuppression(int num_boxes, int num_blocks, int num_keep, + int64_t* suppression_bmask, int8_t* keep_mask) { + extern __shared__ int64_t remv[]; + remv[threadIdx.x] = 0; + for (int i = 0; i < num_boxes; ++i) { + int block_n = i / kBlockSize; + int block_i = i % kBlockSize; + if (!(remv[block_n] & (1Ull << block_i))) { + remv[threadIdx.x] |= suppression_bmask[i * num_blocks + threadIdx.x]; + if (threadIdx.x == block_n && num_keep > 0) { + keep_mask[i] = 1; + num_keep -= 1; + } + } + } +} + +} // namespace + +template +class NmsGpuKernel final : public user_op::OpKernel { + public: + NmsGpuKernel() = default; + ~NmsGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* boxes_blob = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* keep_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const T* boxes = boxes_blob->dptr(); + int8_t* keep = keep_blob->mut_dptr(); + int64_t* suppression_mask = tmp_blob->mut_dptr(); + + const int num_boxes = boxes_blob->shape_view().At(0); + int num_keep = ctx->Attr("keep_n"); + if (num_keep <= 0 || num_keep > num_boxes) { num_keep = num_boxes; } + const int num_blocks = CeilDiv(num_boxes, kBlockSize); + Memset(ctx->stream(), suppression_mask, 0, + num_boxes * num_blocks * sizeof(int64_t)); + Memset(ctx->stream(), keep, 0, num_boxes * sizeof(int8_t)); + + dim3 blocks(num_blocks, num_blocks); + dim3 threads(kBlockSize); + CalcSuppressionBitmaskMatrix<<stream()->As()->cuda_stream()>>>( + num_boxes, ctx->Attr("iou_threshold"), boxes, suppression_mask); + ScanSuppression<<<1, num_blocks, num_blocks * sizeof(int64_t), + ctx->stream()->As()->cuda_stream()>>>( + num_boxes, num_blocks, num_keep, suppression_mask, keep); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_NMS_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("nms") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == DataType::kInt8) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0); \ + int64_t num_boxes = in_shape->At(0); \ + int64_t blocks = CeilDiv(num_boxes, kBlockSize); \ + return num_boxes * blocks * sizeof(int64_t); \ + }); + +REGISTER_NMS_CUDA_KERNEL(float) +REGISTER_NMS_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/normalization_kernel.hip.cpp b/oneflow/user/kernels/normalization_kernel.hip.cpp index e5ea5f0..f80809a 100644 --- a/oneflow/user/kernels/normalization_kernel.hip.cpp +++ b/oneflow/user/kernels/normalization_kernel.hip.cpp @@ -1,534 +1,534 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM - -#include -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cudnn_util.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "hip/hsa_detail/device_functions.h" - -namespace oneflow { - -namespace { - -void InferDimSizeAndDataFormat(const ShapeView& x_shape, const int32_t axis, int32_t* n, int32_t* c, - int32_t* h, int32_t* w, hipdnnTensorFormat_t* format) { - if (x_shape.Count(axis + 1) == 1) { - if (axis == 0) { - *n = 1; - *h = 1; - } else { - *n = x_shape.At(0); - *h = x_shape.Count(1, axis); - } - *w = 1; - *c = x_shape.At(axis); - // *format = HIPDNN_TENSOR_NHWC; - *format = HIPDNN_TENSOR_NCHW; - // std::cout << "don't surpport HIPDNN_TENSOR_NHWC, use HIPDNN_TENSOR_NCHW instead, maybe cause wrong results" << std::endl; - } else { - *n = x_shape.Count(0, axis); - *c = x_shape.At(axis); - *h = x_shape.Count(axis + 1); - *w = 1; - *format = HIPDNN_TENSOR_NCHW; - } -} - -void InferXYCudnnTensorDesc(const ShapeView& xy_shape, const DataType& data_type, - const int32_t axis, hipdnnTensorDescriptor_t xy_desc) { - int32_t n, c, h, w; - hipdnnTensorFormat_t format; - InferDimSizeAndDataFormat(xy_shape, axis, &n, &c, &h, &w, &format); - OF_CUDNN_CHECK( - hipdnnSetTensor4dDescriptor(xy_desc, format, GetCudnnDataType(data_type), n, c, h, w)); -} - -void InferParamCudnnTensorDesc(const hipdnnTensorDescriptor_t xy_desc, hipdnnBatchNormMode_t mode, - hipdnnTensorDescriptor_t param_desc) { - OF_CUDNN_CHECK(hipdnnDeriveBNTensorDescriptor(param_desc, xy_desc, mode)); -} - -class CudnnTensorDescHelper final { - public: - OF_DISALLOW_COPY_AND_MOVE(CudnnTensorDescHelper); - CudnnTensorDescHelper(const ShapeView& xy_shape, const DataType& data_type, const int32_t axis, - hipdnnBatchNormMode_t mode) { - OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(&xy_desc_)); - InferXYCudnnTensorDesc(xy_shape, data_type, axis, xy_desc_); - OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(¶m_desc_)); - InferParamCudnnTensorDesc(xy_desc_, mode, param_desc_); - int n, c, h, w, n_stride, c_stride, h_stride, w_stride; - OF_CUDNN_CHECK(hipdnnGetTensor4dDescriptor(param_desc_, ¶m_data_type_, &n, &c, &h, &w, - &n_stride, &c_stride, &h_stride, &w_stride)); - param_size_ = c; - } - ~CudnnTensorDescHelper() { - OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(param_desc_)); - OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(xy_desc_)); - } - - hipdnnTensorDescriptor_t xy_desc() const { return xy_desc_; } - - hipdnnTensorDescriptor_t param_desc() const { return param_desc_; } - - void CheckParamTensor(const user_op::Tensor* tensor) const { - CHECK_NOTNULL(tensor); - CHECK_EQ(tensor->shape_view().NumAxes(), 1); - CHECK_EQ(tensor->shape_view().At(0), param_size_); - // CHECK_EQ(GetCudnnDataType(tensor->data_type()), param_data_type_); - } - - private: - hipdnnTensorDescriptor_t xy_desc_ = nullptr; - hipdnnTensorDescriptor_t param_desc_ = nullptr; - hipdnnDataType_t param_data_type_; - int32_t param_size_ = 0; -}; - -size_t InferTrainWorkspaceSize(const ShapeView& x_shape, const DataType data_type, - const int32_t axis) { - return 1; -} - -size_t InferTrainTmpSize(user_op::InferContext* ctx) { - const auto& x = ctx->InputTensorDesc("x", 0); - const auto axis = ctx->Attr("axis"); - return InferTrainWorkspaceSize(x.shape(), x.data_type(), axis); -} - -size_t InferGradWorkspaceSize(const ShapeView& x_shape, const DataType data_type, - const int32_t axis) { - return 
1; -} - -size_t InferGradTmpSize(user_op::InferContext* ctx) { - const auto& dy = ctx->InputTensorDesc("dy", 0); - const auto axis = ctx->Attr("axis"); - size_t tmp_size = 0; - if (ctx->op_type_name() == "normalization_add_relu_grad" && !ctx->has_output("addend_diff", 0)) { - tmp_size += GetCudaAlignedSize(dy.shape().elem_cnt() * GetSizeOfDataType(dy.data_type())); - } - tmp_size += GetCudaAlignedSize(InferGradWorkspaceSize(dy.shape(), dy.data_type(), axis)); - return tmp_size; -} - -template -class NormalizationInferenceKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - NormalizationInferenceKernel() = default; - ~NormalizationInferenceKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const bool training = ctx->Attr("training"); - CHECK(!training); - const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); - auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); - auto* moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); - auto* moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); - const auto axis = ctx->Attr("axis"); - const auto epsilon = ctx->Attr("epsilon"); - - const DataType data_type = x->data_type(); - CHECK_EQ(x->shape_view(), y->shape_view()); - CHECK_EQ(y->data_type(), data_type); - CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape_view().NumAxes()); - - const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, - HIPDNN_BATCHNORM_SPATIAL); - desc_helper.CheckParamTensor(gamma); - desc_helper.CheckParamTensor(beta); - desc_helper.CheckParamTensor(moving_mean); - desc_helper.CheckParamTensor(moving_variance); - - const void* sp_alpha = CudnnSPOnePtr(); - const void* sp_beta; - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape_view(), y->shape_view()); - Memcpy( - ctx->stream(), y->mut_dptr(), add_to_output->dptr(), - add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); - sp_beta = CudnnSPOnePtr(); - } else { - sp_beta = CudnnSPZeroPtr(); - } - - OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardInference( - ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL, sp_alpha, - sp_beta, desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), - desc_helper.param_desc(), gamma->dptr(), beta->dptr(), moving_mean->dptr(), - moving_variance->dptr(), epsilon)); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_BN_INFERENCE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value) \ - && (user_op::HobAttr("training") == false)) \ - .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - if (ctx.has_input("_add_to_output", 0)) { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ - } \ - return Maybe::Ok(); \ - }); - -REGISTER_BN_INFERENCE_KERNEL(float16) -REGISTER_BN_INFERENCE_KERNEL(float) -REGISTER_BN_INFERENCE_KERNEL(double) - -#undef REGISTER_BN_INFERENCE_KERNEL - -constexpr int64_t 
kCudaWarpSize = 64; - -template -__global__ void ReluGpu(int64_t n, const T* x, T* y, int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - CUDA_1D_KERNEL_LOOP(i, n) { - const T x_val = x[i]; - const bool is_positive = (x_val > 0); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? x_val : 0; - } -} - -template<> -__global__ void ReluGpu(int64_t n, const half* x, half* y, int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - const half zero = __float2half(0.0f); - CUDA_1D_KERNEL_LOOP(i, n) { - const half x_val = x[i]; - const bool is_positive = __hgt(x_val, zero); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? x_val : zero; - } -} - -template -__global__ void AddReluGpu(int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - CUDA_1D_KERNEL_LOOP(i, n) { - const T sum = x[i] + addend[i]; - const bool is_positive = (sum > 0); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? sum : 0; - } -} - -template<> -__global__ void AddReluGpu(int64_t n, const half* x, const half* addend, half* y, - int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - const half zero = __float2half(0.0f); - CUDA_1D_KERNEL_LOOP(i, n) { - const half sum = __hadd(x[i], addend[i]); - const bool is_positive = __hgt(sum, zero); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? sum : zero; - } -} - -template -void Relu(ep::Stream* stream, int64_t n, const T* x, T* y, int64_t* mask) { - ReluGpu<<As()->cuda_stream()>>>(n, x, y, mask); -} - -template<> -void Relu(ep::Stream* stream, int64_t n, const float16* x, float16* y, int64_t* mask) { - Relu(stream, n, reinterpret_cast(x), reinterpret_cast(y), mask); -} - -template -void AddRelu(ep::Stream* stream, int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { - AddReluGpu<<As()->cuda_stream()>>>(n, x, addend, y, mask); -} - -template<> -void AddRelu(ep::Stream* stream, int64_t n, const float16* x, const float16* addend, - float16* y, int64_t* mask) { - AddRelu(stream, n, reinterpret_cast(x), reinterpret_cast(addend), - reinterpret_cast(y), mask); -} - -template -__global__ void ReluBackwardGpu(int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { - int32_t lane_id = threadIdx.x % kCudaWarpSize; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t mask_val = mask[i / kCudaWarpSize]; - bool is_positive = mask_val & (1 << lane_id); - addend_diff[i] = static_cast(is_positive) * dy[i]; - } -} - -template -void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { - ReluBackwardGpu<<As()->cuda_stream()>>>(n, mask, dy, addend_diff); -} - -template<> -void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const float16* dy, - float16* addend_diff) { - ReluBackward(stream, n, mask, reinterpret_cast(dy), - reinterpret_cast(addend_diff)); -} - -template -class NormalizationTrainKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - NormalizationTrainKernel() = default; - ~NormalizationTrainKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) 
const override { - if (ctx->op_type_name() == "normalization") { CHECK(ctx->Attr("training")); } - const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); - auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); - - const auto axis = ctx->Attr("axis"); - const auto epsilon = ctx->Attr("epsilon"); - const auto momentum = ctx->Attr("momentum"); - - const DataType data_type = x->data_type(); - CHECK_EQ(x->shape_view(), y->shape_view()); - CHECK_EQ(y->data_type(), data_type); - CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape_view().NumAxes()); - const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, - HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); - - const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); - auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - desc_helper.CheckParamTensor(gamma); - desc_helper.CheckParamTensor(beta); - desc_helper.CheckParamTensor(mean); - desc_helper.CheckParamTensor(inv_variance); - - user_op::Tensor* moving_mean = nullptr; - user_op::Tensor* moving_variance = nullptr; - if (ctx->has_input("moving_mean", 0)) { - CHECK(ctx->has_input("moving_variance", 0)); - moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); - moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); - desc_helper.CheckParamTensor(moving_mean); - desc_helper.CheckParamTensor(moving_variance); - } - - const void* sp_alpha = CudnnSPOnePtr(); - const void* sp_beta; - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape_view(), y->shape_view()); - Memcpy( - ctx->stream(), y->mut_dptr(), add_to_output->dptr(), - add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); - sp_beta = CudnnSPOnePtr(); - } else { - sp_beta = CudnnSPZeroPtr(); - } - - OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardTraining( - ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, - const_cast(sp_alpha), const_cast(sp_beta), desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), - desc_helper.param_desc(), const_cast(gamma->dptr()), const_cast(beta->dptr()), 1.0 - momentum, - moving_mean ? moving_mean->mut_dptr() : NULL, - moving_variance ? 
moving_variance->mut_dptr() : NULL, epsilon, mean->mut_dptr(), - inv_variance->mut_dptr())); - - if (ctx->op_type_name() == "normalization_add_relu") { - CHECK(!ctx->has_input("_add_to_output", 0)); - const int64_t elem_cnt = x->shape_view().elem_cnt(); - auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); - if (ctx->has_input("addend", 0)) { - const auto* addend = ctx->Tensor4ArgNameAndIndex("addend", 0); - AddRelu(ctx->stream(), elem_cnt, y->dptr(), addend->dptr(), y->mut_dptr(), - mask->mut_dptr()); - } else { - Relu(ctx->stream(), elem_cnt, y->dptr(), y->mut_dptr(), mask->mut_dptr()); - } - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_BN_TRAIN_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value) \ - && (user_op::HobAttr("training") == true)) \ - .SetInferTmpSizeFn(InferTrainTmpSize) \ - .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - if (ctx.has_input("_add_to_output", 0)) { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ - } \ - return Maybe::Ok(); \ - }); - -REGISTER_BN_TRAIN_KERNEL(float16) -REGISTER_BN_TRAIN_KERNEL(float) -REGISTER_BN_TRAIN_KERNEL(double) - -#define REGISTER_BN_ADD_RELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization_add_relu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTrainTmpSize); - -REGISTER_BN_ADD_RELU_KERNEL(float16) -REGISTER_BN_ADD_RELU_KERNEL(float) -REGISTER_BN_ADD_RELU_KERNEL(double) - -template -class NormalizationGradUserKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - NormalizationGradUserKernel() = default; - ~NormalizationGradUserKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); - auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - auto* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0); - auto* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0); - const auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - const auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const auto axis = ctx->Attr("axis"); - const auto epsilon = ctx->Attr("epsilon"); - - const DataType data_type = x->data_type(); - CHECK_EQ(dy->shape_view(), x->shape_view()); - CHECK_EQ(dy->data_type(), data_type); - CHECK_EQ(dx->shape_view(), x->shape_view()); - CHECK_EQ(dx->data_type(), data_type); - CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape_view().NumAxes()); - - const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, - HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); - desc_helper.CheckParamTensor(gamma); - desc_helper.CheckParamTensor(gamma_diff); - desc_helper.CheckParamTensor(beta_diff); - desc_helper.CheckParamTensor(mean); - desc_helper.CheckParamTensor(inv_variance); - - void* bn_workspace_ptr; - size_t bn_workspace_size; - const void* bn_dy_ptr; - - if (ctx->op_type_name() == 
"normalization_grad") { - bn_workspace_ptr = tmp_buffer->mut_dptr(); - bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); - bn_dy_ptr = dy->dptr(); - } else if (ctx->op_type_name() == "normalization_add_relu_grad") { - const int64_t elem_cnt = dy->shape_view().elem_cnt(); - const auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - if (ctx->has_output("addend_diff", 0)) { - user_op::Tensor* addend_diff = ctx->Tensor4ArgNameAndIndex("addend_diff", 0); - ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), - addend_diff->mut_dptr()); - bn_workspace_ptr = tmp_buffer->mut_dptr(); - bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); - bn_dy_ptr = addend_diff->dptr(); - } else { - const size_t tmp_buffer_size = tmp_buffer->shape_view().elem_cnt(); - const size_t relu_dx_size = - GetCudaAlignedSize(dy->shape_view().elem_cnt() * GetSizeOfDataType(dy->data_type())); - CHECK_GE(tmp_buffer_size, relu_dx_size); - ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), - reinterpret_cast(tmp_buffer->mut_dptr())); - bn_workspace_ptr = tmp_buffer->mut_dptr() + relu_dx_size; - bn_workspace_size = tmp_buffer_size - relu_dx_size; - bn_dy_ptr = tmp_buffer->dptr(); - } - } else { - UNIMPLEMENTED(); - } - - OF_CUDNN_CHECK(hipdnnBatchNormalizationBackward( - ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, - CudnnSPOnePtr(), CudnnSPZeroPtr(), CudnnSPOnePtr(), CudnnSPZeroPtr(), - desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), bn_dy_ptr, desc_helper.xy_desc(), - dx->mut_dptr(), desc_helper.param_desc(), gamma->dptr(), gamma_diff->mut_dptr(), - beta_diff->mut_dptr(), epsilon, mean->dptr(), inv_variance->dptr())); - - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_BN_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferGradTmpSize); - -REGISTER_BN_GRAD_KERNEL(float16) -REGISTER_BN_GRAD_KERNEL(float) -REGISTER_BN_GRAD_KERNEL(double) - -#define REGISTER_BN_ADD_RELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization_add_relu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferGradTmpSize); - -REGISTER_BN_ADD_RELU_GRAD_KERNEL(float16) -REGISTER_BN_ADD_RELU_GRAD_KERNEL(float) -REGISTER_BN_ADD_RELU_GRAD_KERNEL(double) - - -} // namespace -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM + +#include +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cudnn_util.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "hip/hsa_detail/device_functions.h" + +namespace oneflow { + +namespace { + +void InferDimSizeAndDataFormat(const ShapeView& x_shape, const int32_t axis, int32_t* n, int32_t* c, + int32_t* h, int32_t* w, hipdnnTensorFormat_t* format) { + if (x_shape.Count(axis + 1) == 1) { + if (axis == 0) { + *n = 1; + *h = 1; + } else { + *n = x_shape.At(0); + *h = x_shape.Count(1, axis); + } + *w = 1; + *c = x_shape.At(axis); + // *format = HIPDNN_TENSOR_NHWC; + *format = HIPDNN_TENSOR_NCHW; + // std::cout << "don't surpport HIPDNN_TENSOR_NHWC, use HIPDNN_TENSOR_NCHW instead, maybe cause wrong results" << std::endl; + } else { + *n = x_shape.Count(0, axis); + *c = x_shape.At(axis); + *h = x_shape.Count(axis + 1); + *w = 1; + *format = HIPDNN_TENSOR_NCHW; + } +} + +void InferXYCudnnTensorDesc(const ShapeView& xy_shape, const DataType& data_type, + const int32_t axis, hipdnnTensorDescriptor_t xy_desc) { + int32_t n, c, h, w; + hipdnnTensorFormat_t format; + InferDimSizeAndDataFormat(xy_shape, axis, &n, &c, &h, &w, &format); + OF_CUDNN_CHECK( + hipdnnSetTensor4dDescriptor(xy_desc, format, GetCudnnDataType(data_type), n, c, h, w)); +} + +void InferParamCudnnTensorDesc(const hipdnnTensorDescriptor_t xy_desc, hipdnnBatchNormMode_t mode, + hipdnnTensorDescriptor_t param_desc) { + OF_CUDNN_CHECK(hipdnnDeriveBNTensorDescriptor(param_desc, xy_desc, mode)); +} + +class CudnnTensorDescHelper final { + public: + OF_DISALLOW_COPY_AND_MOVE(CudnnTensorDescHelper); + CudnnTensorDescHelper(const ShapeView& xy_shape, const DataType& data_type, const int32_t axis, + hipdnnBatchNormMode_t mode) { + OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(&xy_desc_)); + InferXYCudnnTensorDesc(xy_shape, data_type, axis, xy_desc_); + OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(¶m_desc_)); + InferParamCudnnTensorDesc(xy_desc_, mode, param_desc_); + int n, c, h, w, n_stride, c_stride, h_stride, w_stride; + OF_CUDNN_CHECK(hipdnnGetTensor4dDescriptor(param_desc_, ¶m_data_type_, &n, &c, &h, &w, + &n_stride, &c_stride, &h_stride, &w_stride)); + param_size_ = c; + } + ~CudnnTensorDescHelper() { + OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(param_desc_)); + OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(xy_desc_)); + } + + hipdnnTensorDescriptor_t xy_desc() const { return xy_desc_; } + + hipdnnTensorDescriptor_t param_desc() const { return param_desc_; } + + void CheckParamTensor(const user_op::Tensor* tensor) const { + CHECK_NOTNULL(tensor); + CHECK_EQ(tensor->shape_view().NumAxes(), 1); + CHECK_EQ(tensor->shape_view().At(0), param_size_); + // CHECK_EQ(GetCudnnDataType(tensor->data_type()), param_data_type_); + } + + private: + hipdnnTensorDescriptor_t xy_desc_ = nullptr; + hipdnnTensorDescriptor_t param_desc_ = nullptr; + hipdnnDataType_t param_data_type_; + int32_t param_size_ = 0; +}; + +size_t InferTrainWorkspaceSize(const ShapeView& x_shape, const DataType data_type, + const int32_t axis) { + return 1; +} + +size_t InferTrainTmpSize(user_op::InferContext* ctx) { + const auto& x = ctx->InputTensorDesc("x", 0); + const auto axis = ctx->Attr("axis"); + return InferTrainWorkspaceSize(x.shape(), x.data_type(), axis); +} + +size_t InferGradWorkspaceSize(const ShapeView& x_shape, const DataType data_type, + const int32_t axis) { + return 
1; +} + +size_t InferGradTmpSize(user_op::InferContext* ctx) { + const auto& dy = ctx->InputTensorDesc("dy", 0); + const auto axis = ctx->Attr("axis"); + size_t tmp_size = 0; + if (ctx->op_type_name() == "normalization_add_relu_grad" && !ctx->has_output("addend_diff", 0)) { + tmp_size += GetCudaAlignedSize(dy.shape().elem_cnt() * GetSizeOfDataType(dy.data_type())); + } + tmp_size += GetCudaAlignedSize(InferGradWorkspaceSize(dy.shape(), dy.data_type(), axis)); + return tmp_size; +} + +template +class NormalizationInferenceKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + NormalizationInferenceKernel() = default; + ~NormalizationInferenceKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const bool training = ctx->Attr("training"); + CHECK(!training); + const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); + auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); + auto* moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); + auto* moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); + const auto axis = ctx->Attr("axis"); + const auto epsilon = ctx->Attr("epsilon"); + + const DataType data_type = x->data_type(); + CHECK_EQ(x->shape_view(), y->shape_view()); + CHECK_EQ(y->data_type(), data_type); + CHECK_GE(axis, 0); + CHECK_LT(axis, x->shape_view().NumAxes()); + + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, + HIPDNN_BATCHNORM_SPATIAL); + desc_helper.CheckParamTensor(gamma); + desc_helper.CheckParamTensor(beta); + desc_helper.CheckParamTensor(moving_mean); + desc_helper.CheckParamTensor(moving_variance); + + const void* sp_alpha = CudnnSPOnePtr(); + const void* sp_beta; + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + CHECK_EQ(add_to_output->data_type(), y->data_type()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); + Memcpy( + ctx->stream(), y->mut_dptr(), add_to_output->dptr(), + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + sp_beta = CudnnSPOnePtr(); + } else { + sp_beta = CudnnSPZeroPtr(); + } + + OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardInference( + ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL, sp_alpha, + sp_beta, desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), + desc_helper.param_desc(), gamma->dptr(), beta->dptr(), moving_mean->dptr(), + moving_variance->dptr(), epsilon)); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_BN_INFERENCE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value) \ + && (user_op::HobAttr("training") == false)) \ + .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + if (ctx.has_input("_add_to_output", 0)) { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ + } \ + return Maybe::Ok(); \ + }); + +REGISTER_BN_INFERENCE_KERNEL(float16) +REGISTER_BN_INFERENCE_KERNEL(float) +REGISTER_BN_INFERENCE_KERNEL(double) + +#undef REGISTER_BN_INFERENCE_KERNEL + +constexpr int64_t 
kCudaWarpSize = 64; + +template +__global__ void ReluGpu(int64_t n, const T* x, T* y, int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + CUDA_1D_KERNEL_LOOP(i, n) { + const T x_val = x[i]; + const bool is_positive = (x_val > 0); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? x_val : 0; + } +} + +template<> +__global__ void ReluGpu(int64_t n, const half* x, half* y, int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + const half zero = __float2half(0.0f); + CUDA_1D_KERNEL_LOOP(i, n) { + const half x_val = x[i]; + const bool is_positive = __hgt(x_val, zero); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? x_val : zero; + } +} + +template +__global__ void AddReluGpu(int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + CUDA_1D_KERNEL_LOOP(i, n) { + const T sum = x[i] + addend[i]; + const bool is_positive = (sum > 0); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? sum : 0; + } +} + +template<> +__global__ void AddReluGpu(int64_t n, const half* x, const half* addend, half* y, + int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + const half zero = __float2half(0.0f); + CUDA_1D_KERNEL_LOOP(i, n) { + const half sum = __hadd(x[i], addend[i]); + const bool is_positive = __hgt(sum, zero); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? sum : zero; + } +} + +template +void Relu(ep::Stream* stream, int64_t n, const T* x, T* y, int64_t* mask) { + ReluGpu<<As()->cuda_stream()>>>(n, x, y, mask); +} + +template<> +void Relu(ep::Stream* stream, int64_t n, const float16* x, float16* y, int64_t* mask) { + Relu(stream, n, reinterpret_cast(x), reinterpret_cast(y), mask); +} + +template +void AddRelu(ep::Stream* stream, int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { + AddReluGpu<<As()->cuda_stream()>>>(n, x, addend, y, mask); +} + +template<> +void AddRelu(ep::Stream* stream, int64_t n, const float16* x, const float16* addend, + float16* y, int64_t* mask) { + AddRelu(stream, n, reinterpret_cast(x), reinterpret_cast(addend), + reinterpret_cast(y), mask); +} + +template +__global__ void ReluBackwardGpu(int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { + int32_t lane_id = threadIdx.x % kCudaWarpSize; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t mask_val = mask[i / kCudaWarpSize]; + bool is_positive = mask_val & (1 << lane_id); + addend_diff[i] = static_cast(is_positive) * dy[i]; + } +} + +template +void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { + ReluBackwardGpu<<As()->cuda_stream()>>>(n, mask, dy, addend_diff); +} + +template<> +void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const float16* dy, + float16* addend_diff) { + ReluBackward(stream, n, mask, reinterpret_cast(dy), + reinterpret_cast(addend_diff)); +} + +template +class NormalizationTrainKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + NormalizationTrainKernel() = default; + ~NormalizationTrainKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) 
const override { + if (ctx->op_type_name() == "normalization") { CHECK(ctx->Attr("training")); } + const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); + auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); + + const auto axis = ctx->Attr("axis"); + const auto epsilon = ctx->Attr("epsilon"); + const auto momentum = ctx->Attr("momentum"); + + const DataType data_type = x->data_type(); + CHECK_EQ(x->shape_view(), y->shape_view()); + CHECK_EQ(y->data_type(), data_type); + CHECK_GE(axis, 0); + CHECK_LT(axis, x->shape_view().NumAxes()); + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, + HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); + + const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); + auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); + auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); + desc_helper.CheckParamTensor(gamma); + desc_helper.CheckParamTensor(beta); + desc_helper.CheckParamTensor(mean); + desc_helper.CheckParamTensor(inv_variance); + + user_op::Tensor* moving_mean = nullptr; + user_op::Tensor* moving_variance = nullptr; + if (ctx->has_input("moving_mean", 0)) { + CHECK(ctx->has_input("moving_variance", 0)); + moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); + moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); + desc_helper.CheckParamTensor(moving_mean); + desc_helper.CheckParamTensor(moving_variance); + } + + const void* sp_alpha = CudnnSPOnePtr(); + const void* sp_beta; + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + CHECK_EQ(add_to_output->data_type(), y->data_type()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); + Memcpy( + ctx->stream(), y->mut_dptr(), add_to_output->dptr(), + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + sp_beta = CudnnSPOnePtr(); + } else { + sp_beta = CudnnSPZeroPtr(); + } + + OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardTraining( + ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, + const_cast(sp_alpha), const_cast(sp_beta), desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), + desc_helper.param_desc(), const_cast(gamma->dptr()), const_cast(beta->dptr()), 1.0 - momentum, + moving_mean ? moving_mean->mut_dptr() : NULL, + moving_variance ? 
moving_variance->mut_dptr() : NULL, epsilon, mean->mut_dptr(), + inv_variance->mut_dptr())); + + if (ctx->op_type_name() == "normalization_add_relu") { + CHECK(!ctx->has_input("_add_to_output", 0)); + const int64_t elem_cnt = x->shape_view().elem_cnt(); + auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); + if (ctx->has_input("addend", 0)) { + const auto* addend = ctx->Tensor4ArgNameAndIndex("addend", 0); + AddRelu(ctx->stream(), elem_cnt, y->dptr(), addend->dptr(), y->mut_dptr(), + mask->mut_dptr()); + } else { + Relu(ctx->stream(), elem_cnt, y->dptr(), y->mut_dptr(), mask->mut_dptr()); + } + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_BN_TRAIN_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value) \ + && (user_op::HobAttr("training") == true)) \ + .SetInferTmpSizeFn(InferTrainTmpSize) \ + .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + if (ctx.has_input("_add_to_output", 0)) { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ + } \ + return Maybe::Ok(); \ + }); + +REGISTER_BN_TRAIN_KERNEL(float16) +REGISTER_BN_TRAIN_KERNEL(float) +REGISTER_BN_TRAIN_KERNEL(double) + +#define REGISTER_BN_ADD_RELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization_add_relu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTrainTmpSize); + +REGISTER_BN_ADD_RELU_KERNEL(float16) +REGISTER_BN_ADD_RELU_KERNEL(float) +REGISTER_BN_ADD_RELU_KERNEL(double) + +template +class NormalizationGradUserKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + NormalizationGradUserKernel() = default; + ~NormalizationGradUserKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); + auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + auto* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0); + auto* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0); + const auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); + const auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); + auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const auto axis = ctx->Attr("axis"); + const auto epsilon = ctx->Attr("epsilon"); + + const DataType data_type = x->data_type(); + CHECK_EQ(dy->shape_view(), x->shape_view()); + CHECK_EQ(dy->data_type(), data_type); + CHECK_EQ(dx->shape_view(), x->shape_view()); + CHECK_EQ(dx->data_type(), data_type); + CHECK_GE(axis, 0); + CHECK_LT(axis, x->shape_view().NumAxes()); + + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, + HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); + desc_helper.CheckParamTensor(gamma); + desc_helper.CheckParamTensor(gamma_diff); + desc_helper.CheckParamTensor(beta_diff); + desc_helper.CheckParamTensor(mean); + desc_helper.CheckParamTensor(inv_variance); + + void* bn_workspace_ptr; + size_t bn_workspace_size; + const void* bn_dy_ptr; + + if (ctx->op_type_name() == 
"normalization_grad") { + bn_workspace_ptr = tmp_buffer->mut_dptr(); + bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); + bn_dy_ptr = dy->dptr(); + } else if (ctx->op_type_name() == "normalization_add_relu_grad") { + const int64_t elem_cnt = dy->shape_view().elem_cnt(); + const auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + if (ctx->has_output("addend_diff", 0)) { + user_op::Tensor* addend_diff = ctx->Tensor4ArgNameAndIndex("addend_diff", 0); + ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), + addend_diff->mut_dptr()); + bn_workspace_ptr = tmp_buffer->mut_dptr(); + bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); + bn_dy_ptr = addend_diff->dptr(); + } else { + const size_t tmp_buffer_size = tmp_buffer->shape_view().elem_cnt(); + const size_t relu_dx_size = + GetCudaAlignedSize(dy->shape_view().elem_cnt() * GetSizeOfDataType(dy->data_type())); + CHECK_GE(tmp_buffer_size, relu_dx_size); + ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), + reinterpret_cast(tmp_buffer->mut_dptr())); + bn_workspace_ptr = tmp_buffer->mut_dptr() + relu_dx_size; + bn_workspace_size = tmp_buffer_size - relu_dx_size; + bn_dy_ptr = tmp_buffer->dptr(); + } + } else { + UNIMPLEMENTED(); + } + + OF_CUDNN_CHECK(hipdnnBatchNormalizationBackward( + ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, + CudnnSPOnePtr(), CudnnSPZeroPtr(), CudnnSPOnePtr(), CudnnSPZeroPtr(), + desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), bn_dy_ptr, desc_helper.xy_desc(), + dx->mut_dptr(), desc_helper.param_desc(), gamma->dptr(), gamma_diff->mut_dptr(), + beta_diff->mut_dptr(), epsilon, mean->dptr(), inv_variance->dptr())); + + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_BN_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferGradTmpSize); + +REGISTER_BN_GRAD_KERNEL(float16) +REGISTER_BN_GRAD_KERNEL(float) +REGISTER_BN_GRAD_KERNEL(double) + +#define REGISTER_BN_ADD_RELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization_add_relu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferGradTmpSize); + +REGISTER_BN_ADD_RELU_GRAD_KERNEL(float16) +REGISTER_BN_ADD_RELU_GRAD_KERNEL(float) +REGISTER_BN_ADD_RELU_GRAD_KERNEL(double) + + +} // namespace +} // namespace oneflow + #endif \ No newline at end of file diff --git a/oneflow/user/kernels/nvtx_range_kernel.hip.cpp b/oneflow/user/kernels/nvtx_range_kernel.hip.cpp index 8f22f5f..24a1fa5 100644 --- a/oneflow/user/kernels/nvtx_range_kernel.hip.cpp +++ b/oneflow/user/kernels/nvtx_range_kernel.hip.cpp @@ -1,138 +1,138 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" - -#ifdef OF_ENABLE_PROFILER -#include -#endif // OF_ENABLE_PROFILER - -namespace oneflow { - -namespace { - -#ifdef OF_ENABLE_PROFILER -static thread_local HashMap mark2range_id; -#endif - -} // namespace - -class NvtxOpKernelState final : public user_op::OpKernelState { - public: - NvtxOpKernelState() : counter_(0) { -#ifndef OF_ENABLE_PROFILER - LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON"; -#endif - } - ~NvtxOpKernelState() override = default; - - int64_t counter() const { return counter_; } - void IncreaseCount() { counter_ += 1; } - - private: - int64_t counter_; -}; - -class NvtxStartKernel final : public user_op::OpKernel { - public: - NvtxStartKernel() = default; - ~NvtxStartKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared(); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape_view(); - CHECK_EQ(out->shape_view(), in_shape); - const DataType in_data_type = in->data_type(); - CHECK_EQ(out->data_type(), in_data_type); - Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); -#ifdef OF_ENABLE_PROFILER - auto* kernel_state = dynamic_cast(state); - const std::string mark_prefix = ctx->Attr("mark_prefix"); - const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); - roctx_range_id_t range_id = roctxRangeStartA(mark.c_str()); - CHECK(mark2range_id.emplace(mark, range_id).second); - kernel_state->IncreaseCount(); -#endif // OF_ENABLE_PROFILER - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("nvtx_start") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInplaceProposalFn([](const user_op::InferContext&, - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); - return Maybe::Ok(); - }); - -class NvtxEndKernel final : public user_op::OpKernel { - public: - NvtxEndKernel() = default; - ~NvtxEndKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared(); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape_view(); - CHECK_EQ(out->shape_view(), in_shape); - const DataType in_data_type = in->data_type(); - CHECK_EQ(out->data_type(), in_data_type); -#ifdef OF_ENABLE_PROFILER - auto* kernel_state = dynamic_cast(state); - const std::string mark_prefix = ctx->Attr("mark_prefix"); - const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); - auto it = mark2range_id.find(mark.c_str()); - CHECK(it != mark2range_id.end()); - 
roctx_range_id_t range_id = it->second; - mark2range_id.erase(it); - roctxRangeStop(range_id); - Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); - kernel_state->IncreaseCount(); -#endif - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("nvtx_end") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInplaceProposalFn([](const user_op::InferContext&, - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); - return Maybe::Ok(); - }); - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" + +#ifdef OF_ENABLE_PROFILER +#include +#endif // OF_ENABLE_PROFILER + +namespace oneflow { + +namespace { + +#ifdef OF_ENABLE_PROFILER +static thread_local HashMap mark2range_id; +#endif + +} // namespace + +class NvtxOpKernelState final : public user_op::OpKernelState { + public: + NvtxOpKernelState() : counter_(0) { +#ifndef OF_ENABLE_PROFILER + LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON"; +#endif + } + ~NvtxOpKernelState() override = default; + + int64_t counter() const { return counter_; } + void IncreaseCount() { counter_ += 1; } + + private: + int64_t counter_; +}; + +class NvtxStartKernel final : public user_op::OpKernel { + public: + NvtxStartKernel() = default; + ~NvtxStartKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared(); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); + const DataType in_data_type = in->data_type(); + CHECK_EQ(out->data_type(), in_data_type); + Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), + in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); +#ifdef OF_ENABLE_PROFILER + auto* kernel_state = dynamic_cast(state); + const std::string mark_prefix = ctx->Attr("mark_prefix"); + const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); + roctx_range_id_t range_id = roctxRangeStartA(mark.c_str()); + CHECK(mark2range_id.emplace(mark, range_id).second); + kernel_state->IncreaseCount(); +#endif // OF_ENABLE_PROFILER + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("nvtx_start") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInplaceProposalFn([](const user_op::InferContext&, + user_op::AddInplaceArgPair 
AddInplaceArgPairFn) -> Maybe { + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); + return Maybe::Ok(); + }); + +class NvtxEndKernel final : public user_op::OpKernel { + public: + NvtxEndKernel() = default; + ~NvtxEndKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared(); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); + const DataType in_data_type = in->data_type(); + CHECK_EQ(out->data_type(), in_data_type); +#ifdef OF_ENABLE_PROFILER + auto* kernel_state = dynamic_cast(state); + const std::string mark_prefix = ctx->Attr("mark_prefix"); + const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); + auto it = mark2range_id.find(mark.c_str()); + CHECK(it != mark2range_id.end()); + roctx_range_id_t range_id = it->second; + mark2range_id.erase(it); + roctxRangeStop(range_id); + Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), + in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); + kernel_state->IncreaseCount(); +#endif + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("nvtx_end") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInplaceProposalFn([](const user_op::InferContext&, + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); + return Maybe::Ok(); + }); + +} // namespace oneflow diff --git a/oneflow/user/kernels/one_embedding_kernels.hip.cpp b/oneflow/user/kernels/one_embedding_kernels.hip.cpp index 59d3e50..cf0d24f 100644 --- a/oneflow/user/kernels/one_embedding_kernels.hip.cpp +++ b/oneflow/user/kernels/one_embedding_kernels.hip.cpp @@ -1,634 +1,634 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/embedding/key_value_store.h" -#include "oneflow/core/embedding/embedding_manager.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/user/kernels/random_mask_generator.h" -#include "oneflow/core/framework/random_generator_impl.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/include/device.h" - -namespace oneflow { - -namespace { - -enum class InitializerType { kUniform, kNormal, kConstant }; - -struct EmbeddingInitializer { - InitializerType type; - union { - struct { - float low; - float high; - } uniform_param; - struct { - float mean; - float std; - } normal_param; - struct { - float value; - } constant_param; - }; - - bool operator==(const EmbeddingInitializer& rhs) const { - if (this->type != rhs.type) { return false; } - if (rhs.type == InitializerType::kUniform) { - return (this->uniform_param.low == rhs.uniform_param.low) - && (this->uniform_param.high == rhs.uniform_param.high); - } else if (rhs.type == InitializerType::kNormal) { - return (this->normal_param.mean == rhs.normal_param.mean) - && (this->normal_param.std == rhs.normal_param.std); - } else if (rhs.type == InitializerType::kConstant) { - return this->constant_param.value == rhs.constant_param.value; - } else { - UNIMPLEMENTED(); - return false; - } - } -}; - -void ParseInitializerFromJson(const nlohmann::json& initializer, - EmbeddingInitializer* embedding_initializer) { - CHECK(initializer.contains("type")); - CHECK(initializer["type"].is_string()); - std::string type = initializer["type"].get(); - if (type == "uniform") { - embedding_initializer->type = InitializerType::kUniform; - CHECK(initializer.contains("low")); - CHECK(initializer.contains("high")); - CHECK(initializer["low"].is_number()); - CHECK(initializer["high"].is_number()); - embedding_initializer->uniform_param.low = initializer["low"]; - embedding_initializer->uniform_param.high = initializer["high"]; - } else if (type == "normal") { - CHECK(initializer.contains("mean")); - CHECK(initializer.contains("std")); - CHECK(initializer["mean"].is_number()); - CHECK(initializer["std"].is_number()); - embedding_initializer->type = InitializerType::kNormal; - embedding_initializer->normal_param.mean = initializer["mean"]; - embedding_initializer->normal_param.std = initializer["std"]; - } else if (type == "constant") { - CHECK(initializer.contains("value")); - CHECK(initializer["value"].is_number()); - embedding_initializer->type = InitializerType::kConstant; - embedding_initializer->constant_param.value = initializer["value"]; - } else { - UNIMPLEMENTED() << "Unsupported initializer type"; - } -} - -int32_t ParseJsonToUniqueInitializerVecAndReturnOffset( - const nlohmann::json& initializer, std::vector* initializers) { - EmbeddingInitializer embedding_initializer; - ParseInitializerFromJson(initializer, &embedding_initializer); - for (int32_t i = 0; i < initializers->size(); ++i) { - if (initializers->at(i) == embedding_initializer) { return i; } - } - initializers->push_back(embedding_initializer); - return initializers->size() - 1; -} - -void SetInitializerIndex(int32_t row_id, int32_t col_start, int32_t col_end, int64_t line_size, - int8_t index, std::vector* initializer_index) { - int64_t row_offset = row_id * line_size; - for (int32_t col = col_start; col < col_end; ++col) { - initializer_index->at(row_offset 
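// How the int8_t initializer_index table built here is laid out, as a worked example
// (the sizes are illustrative, not taken from a real config): with num_tables = 2,
// column_dims = [2, 2] (embedding_size = 4) and two optimizer states (line_size = 12),
// the table has 2 * 12 slots. Row t describes table t: slots [0,2) hold the index of
// table t's first column initializer, [2,4) the second column's, [4,8) state 0's and
// [8,12) state 1's. The kernel that fills missing rows then reads the slot at
// table_id * line_size + col, which the helper below spells out.
#include <cstdint>

inline int64_t InitializerSlot(int32_t table_id, int64_t line_size, int32_t col) {
  // mirrors initializer_index[table_idx * line_size + col] in InitValueKernel
  return static_cast<int64_t>(table_id) * line_size + col;
}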
+ col) = index; - } -} - -void ParseAndSetStateInitializerIndex(const std::string& state_initializer, - const int32_t num_tables, const int64_t line_size, - const int64_t embedding_size, - std::vector* initializer_params, - std::vector* initializer_index) { - if (line_size == embedding_size) { return; } - CHECK(!state_initializer.empty()); - auto initializers = nlohmann::json::parse(state_initializer); - CHECK(initializers.is_array()); - const int num_states = line_size / embedding_size - 1; - CHECK_EQ(num_states, initializers.size()); - for (int32_t i = 0; i < num_states; ++i) { - int32_t offset = - ParseJsonToUniqueInitializerVecAndReturnOffset(initializers.at(i), initializer_params); - int32_t col_start = embedding_size + i * embedding_size; - int32_t col_end = col_start + embedding_size; - CHECK_LE(col_end, line_size); - for (int32_t j = 0; j < num_tables; ++j) { - SetInitializerIndex(j, col_start, col_end, line_size, offset, initializer_index); - } - } -} - -void ParseAndSetModelInitializerIndex(const nlohmann::json& tables, - const std::vector& column_dims, - const int32_t num_tables, const int32_t num_columns, - const int64_t line_size, const int64_t embedding_size, - std::vector* initializer_params, - std::vector* initializer_index) { - for (int32_t i = 0; i < num_tables; ++i) { - auto table = tables.at(i); - CHECK(table.contains("columns")); - auto columns = table["columns"]; - CHECK(columns.is_array()); - CHECK_EQ(num_columns, columns.size()) << "columns size must equal to num embedding dims"; - int32_t col_start = 0; - for (int k = 0; k < columns.size(); ++k) { - auto column = columns.at(k); - CHECK(column.contains("initializer")); - int32_t offset = - ParseJsonToUniqueInitializerVecAndReturnOffset(column["initializer"], initializer_params); - int32_t col_end = col_start + column_dims.at(k); - SetInitializerIndex(i, col_start, col_end, line_size, offset, initializer_index); - col_start = col_end; - } - CHECK_EQ(col_start, embedding_size); - } -} - -void ParseInitializers(const int64_t line_size, const int64_t embedding_size, - const std::string& state_initializer, const std::string& json_serialized, - std::vector* initializer_params, - std::vector* initializer_index) { - auto json_object = nlohmann::json::parse(json_serialized); - CHECK(json_object.contains("column_dims")); - std::vector column_dims = json_object["column_dims"]; - const int32_t num_columns = column_dims.size(); - CHECK(json_object.contains("tables")); - auto tables = json_object["tables"]; - CHECK(tables.is_array()); - const int32_t num_tables = tables.size(); - initializer_index->resize(num_tables * line_size); - ParseAndSetStateInitializerIndex(state_initializer, num_tables, line_size, embedding_size, - initializer_params, initializer_index); - ParseAndSetModelInitializerIndex(tables, column_dims, num_tables, num_columns, line_size, - embedding_size, initializer_params, initializer_index); -} - -template -class EmbeddingKernelState final : public user_op::OpKernelState { - public: - explicit EmbeddingKernelState(user_op::KernelInitContext* ctx) - : device_index_(-1), generator_(CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA))) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); - key_value_store_ = Singleton::Get()->GetKeyValueStore( - ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); - uint32_t max_query_length = - ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); - 
key_value_store_->ReserveQueryLength(max_query_length); - - const int64_t embedding_size = ctx->Attr("embedding_size"); - const int64_t line_size = ctx->Attr("line_size"); - const std::string& state_initializer = ctx->Attr("state_initializer"); - - std::vector initializer_param; - std::vector initializer_index; - ParseInitializers(line_size, embedding_size, state_initializer, - ctx->Attr("embedding_tables"), &initializer_param, - &initializer_index); - - const size_t param_size_bytes = initializer_param.size() * sizeof(EmbeddingInitializer); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_param_), param_size_bytes)); - std::memcpy(host_initializer_param_, initializer_param.data(), param_size_bytes); - OF_CUDA_CHECK(hipMalloc(&device_initializer_param_, param_size_bytes)); - OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_param_, host_initializer_param_, - param_size_bytes, hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - - const size_t index_size_bytes = initializer_index.size() * sizeof(int8_t); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_index_), index_size_bytes)); - std::memcpy(host_initializer_index_, initializer_index.data(), index_size_bytes); - OF_CUDA_CHECK(hipMalloc(&device_initializer_index_, index_size_bytes)); - OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_index_, host_initializer_index_, - index_size_bytes, hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - } - ~EmbeddingKernelState() override { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(host_num_keys_)); - OF_CUDA_CHECK(hipHostFree(host_initializer_param_)); - OF_CUDA_CHECK(hipFree(device_initializer_param_)); - OF_CUDA_CHECK(hipHostFree(host_initializer_index_)); - OF_CUDA_CHECK(hipFree(device_initializer_index_)); - } - - void* HostNumKeys() { return host_num_keys_; } - - embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } - - one::Generator* generator() { return generator_.get(); } - - const int8_t* InitializerIndex() { return device_initializer_index_; } - const EmbeddingInitializer* Initializers() { return device_initializer_param_; } - - private: - int device_index_; - void* host_num_keys_; - std::shared_ptr generator_; - embedding::KeyValueStore* key_value_store_; - - EmbeddingInitializer* host_initializer_param_; - EmbeddingInitializer* device_initializer_param_; - int8_t* host_initializer_index_; - int8_t* device_initializer_index_; -}; - -template -class EmbeddingPutKernelState final : public user_op::OpKernelState { - public: - explicit EmbeddingPutKernelState(user_op::KernelInitContext* ctx) : device_index_(-1) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); - key_value_store_ = Singleton::Get()->GetKeyValueStore( - ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); - uint32_t max_query_length = - ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); - key_value_store_->ReserveQueryLength(max_query_length); - } - ~EmbeddingPutKernelState() override { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(host_num_keys_)); - } - - void* HostNumKeys() { return host_num_keys_; } - embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } - - private: - int device_index_; - void* host_num_keys_; - embedding::KeyValueStore* key_value_store_; -}; - -enum class EmbeddingBufferType { kNumMissing = 0, kMissingIndices, kValues, kMaxType }; - -class EmbeddingTmpBufferManager final { - 
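// Sketch of the workspace layout this manager computes: one scratch allocation is
// carved into aligned sub-buffers for the missing-key counter, the missing-index list
// and (optionally) the looked-up values. The 512-byte alignment below is an assumption
// standing in for GetCudaAlignedSize(); the struct and function names are illustrative,
// but the order of the slices mirrors the AllocBuffer calls in the constructor.
#include <cstddef>
#include <cstdint>

struct EmbeddingWorkspaceLayout {
  size_t num_missing_offset = 0;
  size_t missing_indices_offset = 0;
  size_t values_offset = 0;
  size_t total_bytes = 0;
};

inline size_t AlignUp512(size_t n) { return (n + 511) / 512 * 512; }

inline EmbeddingWorkspaceLayout MakeEmbeddingWorkspaceLayout(int64_t num_ids,
                                                             int64_t value_byte_size,
                                                             bool need_value_buffer) {
  EmbeddingWorkspaceLayout layout;
  size_t offset = 0;
  layout.num_missing_offset = offset;
  offset += AlignUp512(sizeof(uint32_t));                 // kNumMissing
  layout.missing_indices_offset = offset;
  offset += AlignUp512(num_ids * sizeof(uint32_t));       // kMissingIndices
  if (need_value_buffer) {
    layout.values_offset = offset;
    offset += AlignUp512(num_ids * value_byte_size);      // kValues
  }
  layout.total_bytes = offset;  // what TotalBufferSize() reports to SetInferTmpSizeFn
  return layout;
}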
public: - OF_DISALLOW_COPY_AND_MOVE(EmbeddingTmpBufferManager); - EmbeddingTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t value_byte_size, - const bool need_value_buffer) - : offset_(0), offsets_(static_cast(EmbeddingBufferType::kMaxType), -1), ptr_(ptr) { - AllocBuffer(EmbeddingBufferType::kNumMissing, sizeof(uint32_t)); - AllocBuffer(EmbeddingBufferType::kMissingIndices, num_ids * sizeof(uint32_t)); - if (need_value_buffer) { AllocBuffer(EmbeddingBufferType::kValues, num_ids * value_byte_size); } - } - - template - T* Ptr(EmbeddingBufferType type) { - CHECK(ptr_ != nullptr); - int64_t offset = offsets_.at(static_cast(type)); - CHECK_NE(offset, -1); - return reinterpret_cast(reinterpret_cast(ptr_) + offset); - } - - size_t TotalBufferSize() const { return offset_; } - - private: - void AllocBuffer(EmbeddingBufferType type, size_t size) { - const size_t type_id = static_cast(type); - CHECK_EQ(offsets_.at(type_id), -1); - offsets_.at(type_id) = offset_; - offset_ += GetCudaAlignedSize(size); - } - - size_t offset_; - std::vector offsets_; - void* ptr_; -}; - -template -__global__ void InitValueKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, - uint64_t inc_offset, const int32_t line_size, - const int32_t embedding_size, - const EmbeddingInitializer* initializer_param, - const int8_t* initializer_index, const U* table_ids, - const uint32_t* num_missing_keys, const uint32_t* missing_indices, - T* values) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - int64_t n = *num_missing_keys * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int row = i / line_size; - int col = i - row * line_size; - const uint32_t index = missing_indices[row]; - const int64_t offset = index * line_size + col; - const int32_t table_idx = table_ids[index]; - const int32_t initializer_idx = initializer_index[table_idx * line_size + col]; - EmbeddingInitializer initializer = initializer_param[initializer_idx]; - T value; - if (initializer.type == InitializerType::kUniform) { - const float low = initializer.uniform_param.low; - const float high = initializer.uniform_param.high; - value = hiprand_uniform(&state) * (high - low) + low; - } else if (initializer.type == InitializerType::kNormal) { - const float mean = initializer.normal_param.mean; - const float std = initializer.normal_param.std; - value = hiprand_normal(&state) * std + mean; - } else if (initializer.type == InitializerType::kConstant) { - value = initializer.constant_param.value; - } else { - asm volatile("s_trap 0;"); - } - values[offset] = value; - } - __syncthreads(); - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -template -void LookupAndInitMissing(ep::Stream* stream, EmbeddingKernelState* embedding_state, - const int64_t num_ids, const int64_t embedding_size, - const int64_t line_size, const void* num_unique_ptr, - const void* unique_ids, const void* table_ids, T* values_ptr, - void* tmp_buffer_ptr, uint32_t* return_num_unique, - const bool put_to_kv_store) { - const auto& generator = embedding_state->generator(); - CHECK_NOTNULL(generator); - std::shared_ptr cuda_generator = - CHECK_JUST(generator->template 
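// Note on the generator bookkeeping at the end of InitValueKernel: every block's
// thread 0 atomically bumps cuda_gen_state->dev_counter, and the last block to finish
// resets the counter and advances dev_offset by inc_offset, so the next launch seeds
// hiprand_init with a fresh Philox subsequence offset instead of reusing random
// numbers. A sketch of one way to bound how many outputs each thread consumes (the
// launch code below computes a similar per-thread bound; the helper name is
// illustrative):
#include <cstdint>

inline uint64_t RandOutputsPerThreadUpperBound(int64_t elem_cnt, int64_t num_blocks,
                                               int64_t threads_per_block) {
  const int64_t total_threads = num_blocks * threads_per_block;
  return static_cast<uint64_t>((elem_cnt + total_threads - 1) / total_threads);  // ceil
}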
Get(stream->device()->device_index())); - uint64_t seed = cuda_generator->current_seed(); - one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); - embedding::KeyValueStore* store = embedding_state->KeyValueStore(); - const EmbeddingInitializer* initializer_param = embedding_state->Initializers(); - const int8_t* initializer_index = embedding_state->InitializerIndex(); - bool need_value_buffer = (values_ptr == nullptr); - EmbeddingTmpBufferManager buffer_manager(tmp_buffer_ptr, num_ids, line_size * sizeof(T), - need_value_buffer); - void* host_num_keys = embedding_state->HostNumKeys(); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ptr, sizeof(IDX), hipMemcpyDefault, - stream->As()->cuda_stream())); - CHECK_JUST(stream->Sync()); - uint32_t num_unique = *reinterpret_cast(host_num_keys); - uint32_t* num_missing_ptr = - buffer_manager.template Ptr(EmbeddingBufferType::kNumMissing); - uint32_t* missing_indices = - buffer_manager.template Ptr(EmbeddingBufferType::kMissingIndices); - T* store_values = - need_value_buffer ? buffer_manager.template Ptr(EmbeddingBufferType::kValues) : values_ptr; - store->Get(stream, num_unique, unique_ids, store_values, num_missing_ptr, missing_indices); - CHECK_GE(sizeof(IDX), sizeof(uint32_t)); // host_num_keys's buffer size is sizeof(IDX) - OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_missing_ptr, sizeof(uint32_t), hipMemcpyDefault, - stream->As()->cuda_stream())); - CHECK_JUST(stream->Sync()); - uint32_t num_missing = *reinterpret_cast(host_num_keys); - // init missing values - if (num_missing > 0) { - const int64_t elem_cnt = num_missing * line_size; - const int64_t num_blocks = BlocksNum4ThreadsNum(elem_cnt); - const uint64_t inc_offset = std::ceil(elem_cnt / num_blocks / kCudaThreadsNumPerBlock); - InitValueKernel - <<As()->cuda_stream()>>>( - seed, cuda_gen_state, inc_offset, line_size, embedding_size, initializer_param, - initializer_index, reinterpret_cast(table_ids), num_missing_ptr, - missing_indices, store_values); - } - if (put_to_kv_store) { store->Put(stream, num_unique, unique_ids, store_values); } - *return_num_unique = num_unique; -} - -template -__global__ void Copy2D(int64_t out_elem_cnt, const int32_t in_cols, const int32_t out_cols, - const T* in, U* out) { - CUDA_1D_KERNEL_LOOP(i, out_elem_cnt) { - const int32_t row = i / out_cols; - const int32_t col = i - row * out_cols; - const int64_t in_offset = row * in_cols + col; - out[i] = static_cast(in[in_offset]); - } -} - -template -void CopyValuesToEmbeddings(ep::Stream* stream, int64_t num_unique, const int32_t embedding_size, - const int32_t value_size, const DataType value_dtype, - const DataType embedding_dtype, const T* values, void* embeddings) { - bool need_cast = (value_dtype != embedding_dtype); - bool need_copy_nd = (embedding_size != value_size); - CHECK(need_cast || need_copy_nd); - if (need_cast && !need_copy_nd) { - const int64_t cast_elem_count = num_unique * embedding_size; - std::unique_ptr cast_primitive = - ep::primitive::NewPrimitive(DeviceType::kCUDA, value_dtype, - embedding_dtype); - cast_primitive->Launch(stream, values, embeddings, cast_elem_count); - } else if (!need_cast && need_copy_nd) { - const int32_t ndims = 2; - DimVector src_pos_vec(ndims, 0); - DimVector dst_pos_vec(ndims, 0); - DimVector src_shape = {num_unique, value_size}; - DimVector dst_shape = {num_unique, embedding_size}; - DimVector extent_shape = {num_unique, embedding_size}; - std::unique_ptr copy_nd_primitive = - ep::primitive::NewPrimitive(DeviceType::kCUDA, 
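// CopyValuesToEmbeddings picks one of three paths: dtype differs but widths match ->
// Cast primitive; widths differ but dtypes match -> CopyNd primitive copying the
// leading embedding_size columns of each value row; both differ -> the fused Copy2D
// kernel defined just above (currently wired up for float -> half only). A CPU
// reference of what Copy2D computes, for illustration:
#include <cstdint>

template<typename T, typename U>
void Copy2DReference(int64_t rows, int32_t in_cols, int32_t out_cols, const T* in, U* out) {
  // take the first out_cols entries of every in_cols-wide input row and cast them
  for (int64_t r = 0; r < rows; ++r) {
    for (int32_t c = 0; c < out_cols; ++c) {
      out[r * out_cols + c] = static_cast<U>(in[r * in_cols + c]);
    }
  }
}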
ndims); - CHECK(copy_nd_primitive); - copy_nd_primitive->Launch(stream, value_dtype, ndims, embeddings, dst_shape.data(), - dst_pos_vec.data(), values, src_shape.data(), src_pos_vec.data(), - extent_shape.data()); - } else { - const int64_t embedding_elem_cnt = num_unique * embedding_size; - if (embedding_dtype == DataType::kFloat16) { - Copy2D<<As()->cuda_stream()>>>( - embedding_elem_cnt, value_size, embedding_size, values, - reinterpret_cast(embeddings)); - } else { - UNIMPLEMENTED(); - } - } -} - -} // namespace - -template -class EmbeddingPrefetchKernel final : public user_op::OpKernel { - public: - EmbeddingPrefetchKernel() = default; - ~EmbeddingPrefetchKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* embedding_state = dynamic_cast*>(state); - CHECK(embedding_state != nullptr); - - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t embedding_size = ctx->Attr("embedding_size"); - const int64_t line_size = ctx->Attr("line_size"); - uint32_t num_unique; - T* values_ptr = nullptr; - LookupAndInitMissing(ctx->stream(), embedding_state, - unique_ids->shape_view().elem_cnt(), embedding_size, line_size, - num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), - values_ptr, tmp_buffer->mut_dptr(), &num_unique, true); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define EMBEDDING_DATA_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) - -#define TABLE_ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) - -#define IDX_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_prefetch") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ - EmbeddingTmpBufferManager buffer_manager( \ - nullptr, unique_ids.shape().elem_cnt(), \ - ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), true); \ - return buffer_manager.TotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL, EMBEDDING_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class EmbeddingLookupKernel final : public user_op::OpKernel { - public: - EmbeddingLookupKernel() = default; - 
~EmbeddingLookupKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* embedding_state = dynamic_cast*>(state); - CHECK(embedding_state != nullptr); - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); - user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t embedding_size = ctx->Attr("embedding_size"); - const int64_t line_size = ctx->Attr("line_size"); - uint32_t num_unique; - LookupAndInitMissing( - ctx->stream(), embedding_state, unique_ids->shape_view().elem_cnt(), embedding_size, - line_size, num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), - unique_values->mut_dptr(), tmp_buffer->mut_dptr(), &num_unique, false); - if (ctx->has_output("embeddings", 0)) { - user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); - CopyValuesToEmbeddings(ctx->stream(), num_unique, embedding_size, line_size, - unique_values->data_type(), embeddings->data_type(), - unique_values->dptr(), embeddings->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_lookup") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ - && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ - EmbeddingTmpBufferManager buffer_manager( \ - nullptr, unique_ids.shape().elem_cnt(), \ - ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), false); \ - return buffer_manager.TotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL, EMBEDDING_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class EmbeddingPutKernel final : public user_op::OpKernel { - public: - EmbeddingPutKernel() = default; - ~EmbeddingPutKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* embedding_state = dynamic_cast*>(state); - CHECK(embedding_state != nullptr); - embedding::KeyValueStore* store = embedding_state->KeyValueStore(); - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - 
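// Note on the host round-trip that follows: the number of unique ids lives in device
// memory, so it is first copied into the pinned host_num_keys buffer and the stream is
// synchronized before store->Put is called with a host-side count. The pinned buffer
// is allocated once in EmbeddingPutKernelState (hipMallocHost), so each iteration only
// pays for one small async copy plus the sync.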
- IDX* host_num_keys = reinterpret_cast(embedding_state->HostNumKeys()); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ids->dptr(), sizeof(IDX), - hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - CHECK_JUST(ctx->stream()->Sync()); - - store->Put(ctx->stream(), *host_num_keys, unique_ids->dptr(), unique_embeddings->dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_PUT_KERNEL(dtype, typeproto) \ - REGISTER_USER_KERNEL("embedding_put") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == typeproto)); - -OF_PP_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PUT_KERNEL, IDX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/embedding/key_value_store.h" +#include "oneflow/core/embedding/embedding_manager.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/user/kernels/random_mask_generator.h" +#include "oneflow/core/framework/random_generator_impl.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/include/device.h" + +namespace oneflow { + +namespace { + +enum class InitializerType { kUniform, kNormal, kConstant }; + +struct EmbeddingInitializer { + InitializerType type; + union { + struct { + float low; + float high; + } uniform_param; + struct { + float mean; + float std; + } normal_param; + struct { + float value; + } constant_param; + }; + + bool operator==(const EmbeddingInitializer& rhs) const { + if (this->type != rhs.type) { return false; } + if (rhs.type == InitializerType::kUniform) { + return (this->uniform_param.low == rhs.uniform_param.low) + && (this->uniform_param.high == rhs.uniform_param.high); + } else if (rhs.type == InitializerType::kNormal) { + return (this->normal_param.mean == rhs.normal_param.mean) + && (this->normal_param.std == rhs.normal_param.std); + } else if (rhs.type == InitializerType::kConstant) { + return this->constant_param.value == rhs.constant_param.value; + } else { + UNIMPLEMENTED(); + return false; + } + } +}; + +void ParseInitializerFromJson(const nlohmann::json& initializer, + EmbeddingInitializer* embedding_initializer) { + CHECK(initializer.contains("type")); + CHECK(initializer["type"].is_string()); + std::string type = initializer["type"].get(); + if (type == "uniform") { + embedding_initializer->type = InitializerType::kUniform; + CHECK(initializer.contains("low")); + CHECK(initializer.contains("high")); + CHECK(initializer["low"].is_number()); + CHECK(initializer["high"].is_number()); + embedding_initializer->uniform_param.low = initializer["low"]; + embedding_initializer->uniform_param.high = initializer["high"]; + } else if (type == "normal") { + 
CHECK(initializer.contains("mean")); + CHECK(initializer.contains("std")); + CHECK(initializer["mean"].is_number()); + CHECK(initializer["std"].is_number()); + embedding_initializer->type = InitializerType::kNormal; + embedding_initializer->normal_param.mean = initializer["mean"]; + embedding_initializer->normal_param.std = initializer["std"]; + } else if (type == "constant") { + CHECK(initializer.contains("value")); + CHECK(initializer["value"].is_number()); + embedding_initializer->type = InitializerType::kConstant; + embedding_initializer->constant_param.value = initializer["value"]; + } else { + UNIMPLEMENTED() << "Unsupported initializer type"; + } +} + +int32_t ParseJsonToUniqueInitializerVecAndReturnOffset( + const nlohmann::json& initializer, std::vector* initializers) { + EmbeddingInitializer embedding_initializer; + ParseInitializerFromJson(initializer, &embedding_initializer); + for (int32_t i = 0; i < initializers->size(); ++i) { + if (initializers->at(i) == embedding_initializer) { return i; } + } + initializers->push_back(embedding_initializer); + return initializers->size() - 1; +} + +void SetInitializerIndex(int32_t row_id, int32_t col_start, int32_t col_end, int64_t line_size, + int8_t index, std::vector* initializer_index) { + int64_t row_offset = row_id * line_size; + for (int32_t col = col_start; col < col_end; ++col) { + initializer_index->at(row_offset + col) = index; + } +} + +void ParseAndSetStateInitializerIndex(const std::string& state_initializer, + const int32_t num_tables, const int64_t line_size, + const int64_t embedding_size, + std::vector* initializer_params, + std::vector* initializer_index) { + if (line_size == embedding_size) { return; } + CHECK(!state_initializer.empty()); + auto initializers = nlohmann::json::parse(state_initializer); + CHECK(initializers.is_array()); + const int num_states = line_size / embedding_size - 1; + CHECK_EQ(num_states, initializers.size()); + for (int32_t i = 0; i < num_states; ++i) { + int32_t offset = + ParseJsonToUniqueInitializerVecAndReturnOffset(initializers.at(i), initializer_params); + int32_t col_start = embedding_size + i * embedding_size; + int32_t col_end = col_start + embedding_size; + CHECK_LE(col_end, line_size); + for (int32_t j = 0; j < num_tables; ++j) { + SetInitializerIndex(j, col_start, col_end, line_size, offset, initializer_index); + } + } +} + +void ParseAndSetModelInitializerIndex(const nlohmann::json& tables, + const std::vector& column_dims, + const int32_t num_tables, const int32_t num_columns, + const int64_t line_size, const int64_t embedding_size, + std::vector* initializer_params, + std::vector* initializer_index) { + for (int32_t i = 0; i < num_tables; ++i) { + auto table = tables.at(i); + CHECK(table.contains("columns")); + auto columns = table["columns"]; + CHECK(columns.is_array()); + CHECK_EQ(num_columns, columns.size()) << "columns size must equal to num embedding dims"; + int32_t col_start = 0; + for (int k = 0; k < columns.size(); ++k) { + auto column = columns.at(k); + CHECK(column.contains("initializer")); + int32_t offset = + ParseJsonToUniqueInitializerVecAndReturnOffset(column["initializer"], initializer_params); + int32_t col_end = col_start + column_dims.at(k); + SetInitializerIndex(i, col_start, col_end, line_size, offset, initializer_index); + col_start = col_end; + } + CHECK_EQ(col_start, embedding_size); + } +} + +void ParseInitializers(const int64_t line_size, const int64_t embedding_size, + const std::string& state_initializer, const std::string& json_serialized, + 
std::vector* initializer_params, + std::vector* initializer_index) { + auto json_object = nlohmann::json::parse(json_serialized); + CHECK(json_object.contains("column_dims")); + std::vector column_dims = json_object["column_dims"]; + const int32_t num_columns = column_dims.size(); + CHECK(json_object.contains("tables")); + auto tables = json_object["tables"]; + CHECK(tables.is_array()); + const int32_t num_tables = tables.size(); + initializer_index->resize(num_tables * line_size); + ParseAndSetStateInitializerIndex(state_initializer, num_tables, line_size, embedding_size, + initializer_params, initializer_index); + ParseAndSetModelInitializerIndex(tables, column_dims, num_tables, num_columns, line_size, + embedding_size, initializer_params, initializer_index); +} + +template +class EmbeddingKernelState final : public user_op::OpKernelState { + public: + explicit EmbeddingKernelState(user_op::KernelInitContext* ctx) + : device_index_(-1), generator_(CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA))) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); + key_value_store_ = Singleton::Get()->GetKeyValueStore( + ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); + uint32_t max_query_length = + ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); + key_value_store_->ReserveQueryLength(max_query_length); + + const int64_t embedding_size = ctx->Attr("embedding_size"); + const int64_t line_size = ctx->Attr("line_size"); + const std::string& state_initializer = ctx->Attr("state_initializer"); + + std::vector initializer_param; + std::vector initializer_index; + ParseInitializers(line_size, embedding_size, state_initializer, + ctx->Attr("embedding_tables"), &initializer_param, + &initializer_index); + + const size_t param_size_bytes = initializer_param.size() * sizeof(EmbeddingInitializer); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_param_), param_size_bytes)); + std::memcpy(host_initializer_param_, initializer_param.data(), param_size_bytes); + OF_CUDA_CHECK(hipMalloc(&device_initializer_param_, param_size_bytes)); + OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_param_, host_initializer_param_, + param_size_bytes, hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + + const size_t index_size_bytes = initializer_index.size() * sizeof(int8_t); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_index_), index_size_bytes)); + std::memcpy(host_initializer_index_, initializer_index.data(), index_size_bytes); + OF_CUDA_CHECK(hipMalloc(&device_initializer_index_, index_size_bytes)); + OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_index_, host_initializer_index_, + index_size_bytes, hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + } + ~EmbeddingKernelState() override { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipHostFree(host_num_keys_)); + OF_CUDA_CHECK(hipHostFree(host_initializer_param_)); + OF_CUDA_CHECK(hipFree(device_initializer_param_)); + OF_CUDA_CHECK(hipHostFree(host_initializer_index_)); + OF_CUDA_CHECK(hipFree(device_initializer_index_)); + } + + void* HostNumKeys() { return host_num_keys_; } + + embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } + + one::Generator* generator() { return generator_.get(); } + + const int8_t* InitializerIndex() { return device_initializer_index_; } + const EmbeddingInitializer* Initializers() { return device_initializer_param_; } + + private: + int device_index_; + void* 
host_num_keys_; + std::shared_ptr generator_; + embedding::KeyValueStore* key_value_store_; + + EmbeddingInitializer* host_initializer_param_; + EmbeddingInitializer* device_initializer_param_; + int8_t* host_initializer_index_; + int8_t* device_initializer_index_; +}; + +template +class EmbeddingPutKernelState final : public user_op::OpKernelState { + public: + explicit EmbeddingPutKernelState(user_op::KernelInitContext* ctx) : device_index_(-1) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); + key_value_store_ = Singleton::Get()->GetKeyValueStore( + ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); + uint32_t max_query_length = + ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); + key_value_store_->ReserveQueryLength(max_query_length); + } + ~EmbeddingPutKernelState() override { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipHostFree(host_num_keys_)); + } + + void* HostNumKeys() { return host_num_keys_; } + embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } + + private: + int device_index_; + void* host_num_keys_; + embedding::KeyValueStore* key_value_store_; +}; + +enum class EmbeddingBufferType { kNumMissing = 0, kMissingIndices, kValues, kMaxType }; + +class EmbeddingTmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(EmbeddingTmpBufferManager); + EmbeddingTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t value_byte_size, + const bool need_value_buffer) + : offset_(0), offsets_(static_cast(EmbeddingBufferType::kMaxType), -1), ptr_(ptr) { + AllocBuffer(EmbeddingBufferType::kNumMissing, sizeof(uint32_t)); + AllocBuffer(EmbeddingBufferType::kMissingIndices, num_ids * sizeof(uint32_t)); + if (need_value_buffer) { AllocBuffer(EmbeddingBufferType::kValues, num_ids * value_byte_size); } + } + + template + T* Ptr(EmbeddingBufferType type) { + CHECK(ptr_ != nullptr); + int64_t offset = offsets_.at(static_cast(type)); + CHECK_NE(offset, -1); + return reinterpret_cast(reinterpret_cast(ptr_) + offset); + } + + size_t TotalBufferSize() const { return offset_; } + + private: + void AllocBuffer(EmbeddingBufferType type, size_t size) { + const size_t type_id = static_cast(type); + CHECK_EQ(offsets_.at(type_id), -1); + offsets_.at(type_id) = offset_; + offset_ += GetCudaAlignedSize(size); + } + + size_t offset_; + std::vector offsets_; + void* ptr_; +}; + +template +__global__ void InitValueKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, + uint64_t inc_offset, const int32_t line_size, + const int32_t embedding_size, + const EmbeddingInitializer* initializer_param, + const int8_t* initializer_index, const U* table_ids, + const uint32_t* num_missing_keys, const uint32_t* missing_indices, + T* values) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + int64_t n = *num_missing_keys * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int row = i / line_size; + int col = i - row * line_size; + const uint32_t index = missing_indices[row]; + const int64_t offset = index * line_size + col; + const int32_t table_idx = table_ids[index]; + const int32_t initializer_idx = initializer_index[table_idx * line_size + col]; + EmbeddingInitializer initializer = initializer_param[initializer_idx]; + T value; + if (initializer.type == InitializerType::kUniform) { + const float low = initializer.uniform_param.low; + const 
float high = initializer.uniform_param.high; + value = hiprand_uniform(&state) * (high - low) + low; + } else if (initializer.type == InitializerType::kNormal) { + const float mean = initializer.normal_param.mean; + const float std = initializer.normal_param.std; + value = hiprand_normal(&state) * std + mean; + } else if (initializer.type == InitializerType::kConstant) { + value = initializer.constant_param.value; + } else { + asm volatile("s_trap 0;"); + } + values[offset] = value; + } + __syncthreads(); + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +template +void LookupAndInitMissing(ep::Stream* stream, EmbeddingKernelState* embedding_state, + const int64_t num_ids, const int64_t embedding_size, + const int64_t line_size, const void* num_unique_ptr, + const void* unique_ids, const void* table_ids, T* values_ptr, + void* tmp_buffer_ptr, uint32_t* return_num_unique, + const bool put_to_kv_store) { + const auto& generator = embedding_state->generator(); + CHECK_NOTNULL(generator); + std::shared_ptr cuda_generator = + CHECK_JUST(generator->template Get(stream->device()->device_index())); + uint64_t seed = cuda_generator->current_seed(); + one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); + embedding::KeyValueStore* store = embedding_state->KeyValueStore(); + const EmbeddingInitializer* initializer_param = embedding_state->Initializers(); + const int8_t* initializer_index = embedding_state->InitializerIndex(); + bool need_value_buffer = (values_ptr == nullptr); + EmbeddingTmpBufferManager buffer_manager(tmp_buffer_ptr, num_ids, line_size * sizeof(T), + need_value_buffer); + void* host_num_keys = embedding_state->HostNumKeys(); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ptr, sizeof(IDX), hipMemcpyDefault, + stream->As()->cuda_stream())); + CHECK_JUST(stream->Sync()); + uint32_t num_unique = *reinterpret_cast(host_num_keys); + uint32_t* num_missing_ptr = + buffer_manager.template Ptr(EmbeddingBufferType::kNumMissing); + uint32_t* missing_indices = + buffer_manager.template Ptr(EmbeddingBufferType::kMissingIndices); + T* store_values = + need_value_buffer ? 
buffer_manager.template Ptr(EmbeddingBufferType::kValues) : values_ptr; + store->Get(stream, num_unique, unique_ids, store_values, num_missing_ptr, missing_indices); + CHECK_GE(sizeof(IDX), sizeof(uint32_t)); // host_num_keys's buffer size is sizeof(IDX) + OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_missing_ptr, sizeof(uint32_t), hipMemcpyDefault, + stream->As()->cuda_stream())); + CHECK_JUST(stream->Sync()); + uint32_t num_missing = *reinterpret_cast(host_num_keys); + // init missing values + if (num_missing > 0) { + const int64_t elem_cnt = num_missing * line_size; + const int64_t num_blocks = BlocksNum4ThreadsNum(elem_cnt); + const uint64_t inc_offset = std::ceil(elem_cnt / num_blocks / kCudaThreadsNumPerBlock); + InitValueKernel + <<As()->cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, line_size, embedding_size, initializer_param, + initializer_index, reinterpret_cast(table_ids), num_missing_ptr, + missing_indices, store_values); + } + if (put_to_kv_store) { store->Put(stream, num_unique, unique_ids, store_values); } + *return_num_unique = num_unique; +} + +template +__global__ void Copy2D(int64_t out_elem_cnt, const int32_t in_cols, const int32_t out_cols, + const T* in, U* out) { + CUDA_1D_KERNEL_LOOP(i, out_elem_cnt) { + const int32_t row = i / out_cols; + const int32_t col = i - row * out_cols; + const int64_t in_offset = row * in_cols + col; + out[i] = static_cast(in[in_offset]); + } +} + +template +void CopyValuesToEmbeddings(ep::Stream* stream, int64_t num_unique, const int32_t embedding_size, + const int32_t value_size, const DataType value_dtype, + const DataType embedding_dtype, const T* values, void* embeddings) { + bool need_cast = (value_dtype != embedding_dtype); + bool need_copy_nd = (embedding_size != value_size); + CHECK(need_cast || need_copy_nd); + if (need_cast && !need_copy_nd) { + const int64_t cast_elem_count = num_unique * embedding_size; + std::unique_ptr cast_primitive = + ep::primitive::NewPrimitive(DeviceType::kCUDA, value_dtype, + embedding_dtype); + cast_primitive->Launch(stream, values, embeddings, cast_elem_count); + } else if (!need_cast && need_copy_nd) { + const int32_t ndims = 2; + DimVector src_pos_vec(ndims, 0); + DimVector dst_pos_vec(ndims, 0); + DimVector src_shape = {num_unique, value_size}; + DimVector dst_shape = {num_unique, embedding_size}; + DimVector extent_shape = {num_unique, embedding_size}; + std::unique_ptr copy_nd_primitive = + ep::primitive::NewPrimitive(DeviceType::kCUDA, ndims); + CHECK(copy_nd_primitive); + copy_nd_primitive->Launch(stream, value_dtype, ndims, embeddings, dst_shape.data(), + dst_pos_vec.data(), values, src_shape.data(), src_pos_vec.data(), + extent_shape.data()); + } else { + const int64_t embedding_elem_cnt = num_unique * embedding_size; + if (embedding_dtype == DataType::kFloat16) { + Copy2D<<As()->cuda_stream()>>>( + embedding_elem_cnt, value_size, embedding_size, values, + reinterpret_cast(embeddings)); + } else { + UNIMPLEMENTED(); + } + } +} + +} // namespace + +template +class EmbeddingPrefetchKernel final : public user_op::OpKernel { + public: + EmbeddingPrefetchKernel() = default; + ~EmbeddingPrefetchKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* embedding_state = dynamic_cast*>(state); + 
CHECK(embedding_state != nullptr); + + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); + const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t embedding_size = ctx->Attr("embedding_size"); + const int64_t line_size = ctx->Attr("line_size"); + uint32_t num_unique; + T* values_ptr = nullptr; + LookupAndInitMissing(ctx->stream(), embedding_state, + unique_ids->shape_view().elem_cnt(), embedding_size, line_size, + num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), + values_ptr, tmp_buffer->mut_dptr(), &num_unique, true); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define EMBEDDING_DATA_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) + +#define TABLE_ID_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ + OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) + +#define IDX_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_prefetch") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ + EmbeddingTmpBufferManager buffer_manager( \ + nullptr, unique_ids.shape().elem_cnt(), \ + ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), true); \ + return buffer_manager.TotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL, EMBEDDING_DATA_TYPE_SEQ, + TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class EmbeddingLookupKernel final : public user_op::OpKernel { + public: + EmbeddingLookupKernel() = default; + ~EmbeddingLookupKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* embedding_state = dynamic_cast*>(state); + CHECK(embedding_state != nullptr); + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); + const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t embedding_size = ctx->Attr("embedding_size"); + const int64_t line_size = ctx->Attr("line_size"); + uint32_t num_unique; + LookupAndInitMissing( + ctx->stream(), embedding_state, 
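// The two callers of LookupAndInitMissing differ only in where values land:
// embedding_prefetch passes values_ptr = nullptr, so the looked-up/initialized rows go
// to the kValues slice of tmp_buffer and are written back with store->Put
// (put_to_kv_store = true); embedding_lookup (here) writes them straight into
// unique_values, skips the Put, and optionally casts/slices them into the "embeddings"
// output via CopyValuesToEmbeddings.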
unique_ids->shape_view().elem_cnt(), embedding_size, + line_size, num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), + unique_values->mut_dptr(), tmp_buffer->mut_dptr(), &num_unique, false); + if (ctx->has_output("embeddings", 0)) { + user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); + CopyValuesToEmbeddings(ctx->stream(), num_unique, embedding_size, line_size, + unique_values->data_type(), embeddings->data_type(), + unique_values->dptr(), embeddings->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_lookup") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ + && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ + EmbeddingTmpBufferManager buffer_manager( \ + nullptr, unique_ids.shape().elem_cnt(), \ + ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), false); \ + return buffer_manager.TotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL, EMBEDDING_DATA_TYPE_SEQ, + TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class EmbeddingPutKernel final : public user_op::OpKernel { + public: + EmbeddingPutKernel() = default; + ~EmbeddingPutKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* embedding_state = dynamic_cast*>(state); + CHECK(embedding_state != nullptr); + embedding::KeyValueStore* store = embedding_state->KeyValueStore(); + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + + IDX* host_num_keys = reinterpret_cast(embedding_state->HostNumKeys()); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ids->dptr(), sizeof(IDX), + hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + CHECK_JUST(ctx->stream()->Sync()); + + store->Put(ctx->stream(), *host_num_keys, unique_ids->dptr(), unique_embeddings->dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_PUT_KERNEL(dtype, typeproto) \ + REGISTER_USER_KERNEL("embedding_put") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == typeproto)); + +OF_PP_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PUT_KERNEL, IDX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp b/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp index a134dda..db1decc 100644 --- a/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp +++ 
b/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp @@ -1,604 +1,604 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/user/kernels/model_update_kernel_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void SGDUpdateKernel(const int64_t embedding_size, T scale, float l1, float l2, - float weight_decay, const IDX* num_unique_ids, - const float* learning_rate, const T* scale_by_ptr, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* model, T* updated_model) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_model[i] = model[i]; } - } else { - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - updated_model[i] = model[i]; - SGDUpdateFunctor()(model_diff + i, updated_model + i, scale, l1, l2, weight_decay, - learning_rate_val); - } - } -} - -__device__ void GetMomentumOffset(const int32_t line_size, const int32_t embedding_size, - int64_t model_diff_offset, int64_t* model_offset, - int64_t* momentum_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *momentum_offset = *model_offset + embedding_size; -} - -template -__global__ void MomentumUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, - float l1, float l2, float weight_decay, float beta, - const IDX* num_unique_ids, const float* learning_rate, - const T* scale_by_ptr, const T* down_scale_by_ptr, - const int64_t* skip_if, const G* model_diff, - const T* unique_values, T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } - } else { - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t momentum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &momentum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[momentum_offset] = unique_values[momentum_offset]; - MomentumUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + momentum_offset, scale, l1, l2, beta, - weight_decay, learning_rate_val); - } - } -} - -__device__ void GetAdamOffset(const int32_t line_size, const int32_t 
embedding_size, - int64_t model_diff_offset, int64_t* model_offset, int64_t* m_offset, - int64_t* v_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *m_offset = *model_offset + embedding_size; - *v_offset = *model_offset + 2 * embedding_size; -} - -template -__global__ void AdamUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, - float l1, float l2, float weight_decay, float beta1, float beta2, - float epsilon, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const IDX* num_unique_ids, - const float* learning_rate, const T* scale_by_ptr, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* unique_values, - T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { - // The n is the unique_values elem_cnt, so not need to use GetAdamOffset. - updated_unique_values[i] = unique_values[i]; - } - } else { - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float bias_correction1_val = 1.0; - float bias_correction2_val = 1.0; - if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - // The n is model_diff elem_cnt. - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t m_offset; - int64_t v_offset; - GetAdamOffset(line_size, embedding_size, i, &model_offset, &m_offset, &v_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[m_offset] = unique_values[m_offset]; - updated_unique_values[v_offset] = unique_values[v_offset]; - AdamUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + m_offset, updated_unique_values + v_offset, - nullptr, scale, l1, l2, beta1, beta2, epsilon, weight_decay, false, - bias_correction1_val, bias_correction2_val, learning_rate_val); - } - } -} - -template -__global__ void AdagradUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, - float l1, float l2, float weight_decay, float lr_decay, - float epsilon, const IDX* num_unique_ids, - const float* learning_rate, const int64_t* train_step_ptr, - const T* scale_by_ptr, const T* down_scale_by_ptr, - const int64_t* skip_if, const G* model_diff, - const T* unique_values, T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } - } else { - int64_t train_step = *train_step_ptr + 1; - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t sum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &sum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[sum_offset] = unique_values[sum_offset]; - 
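// Illustrative host-side sketch of the packed row layout these update kernels share: each
// unique id owns a row of line_size values holding the embedding followed by its optimizer
// state, so line_size is 2 * embedding_size for momentum/Adagrad and 3 * embedding_size for
// Adam/FTRL. The helper below mirrors GetMomentumOffset to show how a flat gradient index
// lands in that row; the names are hypothetical stand-ins, not part of the kernels.
#include <cassert>
#include <cstdint>

void MomentumOffsetSketch(int32_t line_size, int32_t embedding_size, int64_t grad_offset,
                          int64_t* model_offset, int64_t* momentum_offset) {
  const int32_t row = grad_offset / embedding_size;        // which unique id
  const int32_t col = grad_offset - row * embedding_size;  // position inside the embedding
  *model_offset = row * line_size + col;                   // embedding part of the packed row
  *momentum_offset = *model_offset + embedding_size;       // state slot right after it
}

int main() {
  int64_t model_offset = 0;
  int64_t momentum_offset = 0;
  // embedding_size = 4, line_size = 8: gradient element 6 is row 1, column 2.
  MomentumOffsetSketch(/*line_size=*/8, /*embedding_size=*/4, /*grad_offset=*/6,
                       &model_offset, &momentum_offset);
  assert(model_offset == 10);     // 1 * 8 + 2
  assert(momentum_offset == 14);  // 10 + 4
  return 0;
}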
AdagradUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + sum_offset, scale, l1, l2, epsilon, - weight_decay, learning_rate_val); - } - } -} - -__device__ void GetFtrlOffset(const int32_t line_size, const int32_t embedding_size, - int64_t model_diff_offset, int64_t* model_offset, - int64_t* accumulate_offset, int64_t* z_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *accumulate_offset = *model_offset + embedding_size; - *z_offset = *model_offset + 2 * embedding_size; -} - -template -__global__ void FtrlUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, - float l1, float l2, float weight_decay, float lr_power, - float lambda1, float lambda2, float beta, - const IDX* num_unique_ids, const float* learning_rate, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* unique_values, - T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } - } else { - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t accumulate_offset; - int64_t z_offset; - GetFtrlOffset(line_size, embedding_size, i, &model_offset, &accumulate_offset, &z_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[accumulate_offset] = unique_values[accumulate_offset]; - updated_unique_values[z_offset] = unique_values[z_offset]; - FtrlUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + accumulate_offset, - updated_unique_values + z_offset, scale, l1, l2, lr_power, lambda1, - lambda2, beta, weight_decay, learning_rate_val); - } - } -} - -} // namespace - -template -class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - SgdEmbeddingUpdateKernel() = default; - ~SgdEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size); - const auto scale = ctx->Attr("scale"); - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - 
CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - SGDUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - embedding_size, scale, l1, l2, weight_decay, - reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, - down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), - updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define IDX_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("sgd_embedding_update") \ - .SetCreateFn< \ - SgdEmbeddingUpdateKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, - // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) -template -class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - MomentumEmbeddingUpdateKernel() = default; - ~MomentumEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 2); - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const auto beta = ctx->Attr("beta"); - const auto scale = ctx->Attr("scale"); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = 
ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - MomentumUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, scale, l1, l2, weight_decay, beta, - reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, - down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), - updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("momentum_embedding_update") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL, - // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, - IDX_DATA_TYPE_SEQ) - -template -class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - AdamEmbeddingUpdateKernel() = default; - ~AdamEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 3); - - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const auto beta1 = ctx->Attr("beta1"); - const auto beta2 = ctx->Attr("beta2"); - const auto epsilon = ctx->Attr("epsilon"); - 
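// Minimal scalar sketch of one bias-corrected Adam step, assuming AdamUpdateFunctor
// (defined in model_update_kernel_util.h, not shown in this hunk) follows the standard
// Adam formulas; the l1/l2/weight_decay/scale terms it also takes are omitted, and
// bias_correction1/2 arrive as precomputed scalars that default to 1.0f when the optional
// inputs are absent. Names below are illustrative only.
#include <cmath>
#include <cstdio>

void AdamStepSketch(float grad, float* w, float* m, float* v, float lr, float beta1,
                    float beta2, float epsilon, float bias_correction1,
                    float bias_correction2) {
  *m = beta1 * *m + (1.0f - beta1) * grad;          // first moment
  *v = beta2 * *v + (1.0f - beta2) * grad * grad;   // second moment
  const float m_hat = *m / bias_correction1;        // bias-corrected moments
  const float v_hat = *v / bias_correction2;
  *w -= lr * m_hat / (std::sqrt(v_hat) + epsilon);  // parameter update
}

int main() {
  float w = 1.0f, m = 0.0f, v = 0.0f;
  // First step (t = 1): bias_correction1 = 1 - beta1, bias_correction2 = 1 - beta2.
  AdamStepSketch(/*grad=*/0.5f, &w, &m, &v, /*lr=*/1e-3f, /*beta1=*/0.9f, /*beta2=*/0.999f,
                 /*epsilon=*/1e-8f, /*bias_correction1=*/0.1f, /*bias_correction2=*/0.001f);
  std::printf("w after one step: %f\n", w);  // roughly 0.999
  return 0;
}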
const bool do_bias_correction = ctx->Attr("do_bias_correction"); - const auto scale = ctx->Attr("scale"); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - const float* bias_correction1_ptr = nullptr; - if (ctx->has_input("bias_correction1", 0)) { - bias_correction1_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0)->dptr(); - } - const float* bias_correction2_ptr = nullptr; - if (ctx->has_input("bias_correction2", 0)) { - bias_correction2_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0)->dptr(); - } - // update kernel - AdamUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, beta1, beta2, - epsilon, bias_correction1_ptr, bias_correction2_ptr, - reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, - down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), - updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("adam_embedding_update") \ - .SetCreateFn< \ - AdamEmbeddingUpdateKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, - // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - AdagradEmbeddingUpdateKernel() = default; - ~AdagradEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* 
updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 2); - - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const auto lr_decay = ctx->Attr("lr_decay"); - const auto epsilon = ctx->Attr("epsilon"); - const auto scale = ctx->Attr("scale"); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* train_step_ptr = ctx->Tensor4ArgNameAndIndex("train_step", 0)->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - AdagradUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_decay, - epsilon, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, - train_step_ptr, scale_by_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), - unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("adagrad_embedding_update") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL, - // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, - IDX_DATA_TYPE_SEQ) - -template -class FtrlEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - FtrlEmbeddingUpdateKernel() = default; - ~FtrlEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = 
ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2) - << "The NumAxes of unique_embedding should be equal to 2. "; - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2) - << "The NumAxes of embedding_grad should be equal to 2. "; - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 3) - << "The line_size should be equal to 3 x embedding_size. "; - const float l1 = 0.0; - const float l2 = 0.0; - const float weight_decay = ctx->Attr("weight_decay"); - // TODO(zhengzekang): Undefined behavior for ftrl optimizer with weight_decay in `abs(new_z_val) - // < lambda1` condition. - CHECK_EQ(weight_decay, static_cast(0.0)) - << "Currently not support for setting weight decay. "; - const float lr_power = ctx->Attr("lr_power"); - const float lambda1 = ctx->Attr("lambda1"); - const float lambda2 = ctx->Attr("lambda2"); - const float beta = ctx->Attr("beta"); - const double scale = ctx->Attr("scale"); - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - FtrlUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_power, - lambda1, lambda2, beta, reinterpret_cast(num_unique_ids->dptr()), - learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), - unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; -#define REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("ftrl_embedding_update") \ - .SetCreateFn< \ - FtrlEmbeddingUpdateKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, - // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/user/kernels/model_update_kernel_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void SGDUpdateKernel(const int64_t embedding_size, T scale, float l1, float l2, + float weight_decay, const IDX* num_unique_ids, + const float* learning_rate, const T* scale_by_ptr, + const T* down_scale_by_ptr, const int64_t* skip_if, + const G* model_diff, const T* model, T* updated_model) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_model[i] = model[i]; } + } else { + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + updated_model[i] = model[i]; + SGDUpdateFunctor()(model_diff + i, updated_model + i, scale, l1, l2, weight_decay, + learning_rate_val); + } + } +} + +__device__ void GetMomentumOffset(const int32_t line_size, const int32_t embedding_size, + int64_t model_diff_offset, int64_t* model_offset, + int64_t* momentum_offset) { + const int32_t row = model_diff_offset / embedding_size; + const int32_t col = model_diff_offset - row * embedding_size; + *model_offset = row * line_size + col; + *momentum_offset = *model_offset + embedding_size; +} + +template +__global__ void MomentumUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, + float l1, float l2, float weight_decay, float beta, + const IDX* num_unique_ids, const float* learning_rate, + const T* scale_by_ptr, const T* down_scale_by_ptr, + const int64_t* skip_if, const G* model_diff, + const T* unique_values, T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } + } else { + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t momentum_offset; + GetMomentumOffset(line_size, embedding_size, i, &model_offset, &momentum_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[momentum_offset] = unique_values[momentum_offset]; + MomentumUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + momentum_offset, scale, l1, l2, beta, + weight_decay, learning_rate_val); + } + } +} + +__device__ void GetAdamOffset(const int32_t line_size, const int32_t embedding_size, + int64_t model_diff_offset, int64_t* model_offset, int64_t* m_offset, + int64_t* v_offset) { + const int32_t row = model_diff_offset 
/ embedding_size; + const int32_t col = model_diff_offset - row * embedding_size; + *model_offset = row * line_size + col; + *m_offset = *model_offset + embedding_size; + *v_offset = *model_offset + 2 * embedding_size; +} + +template +__global__ void AdamUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, + float l1, float l2, float weight_decay, float beta1, float beta2, + float epsilon, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const IDX* num_unique_ids, + const float* learning_rate, const T* scale_by_ptr, + const T* down_scale_by_ptr, const int64_t* skip_if, + const G* model_diff, const T* unique_values, + T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { + // The n is the unique_values elem_cnt, so not need to use GetAdamOffset. + updated_unique_values[i] = unique_values[i]; + } + } else { + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float bias_correction1_val = 1.0; + float bias_correction2_val = 1.0; + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + // The n is model_diff elem_cnt. + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t m_offset; + int64_t v_offset; + GetAdamOffset(line_size, embedding_size, i, &model_offset, &m_offset, &v_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[m_offset] = unique_values[m_offset]; + updated_unique_values[v_offset] = unique_values[v_offset]; + AdamUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + m_offset, updated_unique_values + v_offset, + nullptr, scale, l1, l2, beta1, beta2, epsilon, weight_decay, false, + bias_correction1_val, bias_correction2_val, learning_rate_val); + } + } +} + +template +__global__ void AdagradUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, + float l1, float l2, float weight_decay, float lr_decay, + float epsilon, const IDX* num_unique_ids, + const float* learning_rate, const int64_t* train_step_ptr, + const T* scale_by_ptr, const T* down_scale_by_ptr, + const int64_t* skip_if, const G* model_diff, + const T* unique_values, T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } + } else { + int64_t train_step = *train_step_ptr + 1; + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t sum_offset; + GetMomentumOffset(line_size, embedding_size, i, &model_offset, &sum_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[sum_offset] = unique_values[sum_offset]; + AdagradUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + sum_offset, scale, l1, l2, epsilon, + weight_decay, 
learning_rate_val); + } + } +} + +__device__ void GetFtrlOffset(const int32_t line_size, const int32_t embedding_size, + int64_t model_diff_offset, int64_t* model_offset, + int64_t* accumulate_offset, int64_t* z_offset) { + const int32_t row = model_diff_offset / embedding_size; + const int32_t col = model_diff_offset - row * embedding_size; + *model_offset = row * line_size + col; + *accumulate_offset = *model_offset + embedding_size; + *z_offset = *model_offset + 2 * embedding_size; +} + +template +__global__ void FtrlUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, + float l1, float l2, float weight_decay, float lr_power, + float lambda1, float lambda2, float beta, + const IDX* num_unique_ids, const float* learning_rate, + const T* down_scale_by_ptr, const int64_t* skip_if, + const G* model_diff, const T* unique_values, + T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } + } else { + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t accumulate_offset; + int64_t z_offset; + GetFtrlOffset(line_size, embedding_size, i, &model_offset, &accumulate_offset, &z_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[accumulate_offset] = unique_values[accumulate_offset]; + updated_unique_values[z_offset] = unique_values[z_offset]; + FtrlUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + accumulate_offset, + updated_unique_values + z_offset, scale, l1, l2, lr_power, lambda1, + lambda2, beta, weight_decay, learning_rate_val); + } + } +} + +} // namespace + +template +class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + SgdEmbeddingUpdateKernel() = default; + ~SgdEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size); + const auto scale = ctx->Attr("scale"); + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const T* scale_by_ptr = nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = 
scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + SGDUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + embedding_size, scale, l1, l2, weight_decay, + reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, + down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), + updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define IDX_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("sgd_embedding_update") \ + .SetCreateFn< \ + SgdEmbeddingUpdateKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, + // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) +template +class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + MomentumEmbeddingUpdateKernel() = default; + ~MomentumEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 2); + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const auto beta = ctx->Attr("beta"); + const auto scale = ctx->Attr("scale"); + const T* scale_by_ptr = nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + 
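// Sketch of the launch pattern the Compute methods in this file share, assuming the OneFlow
// helpers kCudaThreadsNumPerBlock, BlocksNum4ThreadsNum and CUDA_1D_KERNEL_LOOP follow the
// usual grid-stride scheme; the names below are illustrative stand-ins, not the real
// definitions.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

constexpr int kThreadsPerBlockSketch = 512;  // stand-in for kCudaThreadsNumPerBlock

inline int BlocksForSketch(int64_t n) {      // stand-in for BlocksNum4ThreadsNum
  return static_cast<int>((n + kThreadsPerBlockSketch - 1) / kThreadsPerBlockSketch);
}

__global__ void ScaleSketch(int64_t n, float scale, const float* src, float* dst) {
  // Grid-stride loop, conceptually what CUDA_1D_KERNEL_LOOP(i, n) expands to.
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    dst[i] = scale * src[i];
  }
}

int main() {
  const int64_t n = 1 << 20;
  std::vector<float> host(n, 1.0f);
  float* src = nullptr;
  float* dst = nullptr;
  hipMalloc(reinterpret_cast<void**>(&src), n * sizeof(float));
  hipMalloc(reinterpret_cast<void**>(&dst), n * sizeof(float));
  hipMemcpy(src, host.data(), n * sizeof(float), hipMemcpyHostToDevice);
  ScaleSketch<<<BlocksForSketch(n), kThreadsPerBlockSketch, 0, 0>>>(n, 2.0f, src, dst);
  hipMemcpy(host.data(), dst, n * sizeof(float), hipMemcpyDeviceToHost);
  std::printf("dst[0] = %f\n", host[0]);  // expect 2.000000
  hipFree(src);
  hipFree(dst);
  return 0;
}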
CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + MomentumUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, scale, l1, l2, weight_decay, beta, + reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, + down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), + updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("momentum_embedding_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL, + // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, + FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, + IDX_DATA_TYPE_SEQ) + +template +class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + AdamEmbeddingUpdateKernel() = default; + ~AdamEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 3); + + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const auto beta1 = ctx->Attr("beta1"); + const auto beta2 = ctx->Attr("beta2"); + const auto epsilon = ctx->Attr("epsilon"); + const bool do_bias_correction = ctx->Attr("do_bias_correction"); + const auto scale = ctx->Attr("scale"); + const T* scale_by_ptr 
= nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + const float* bias_correction1_ptr = nullptr; + if (ctx->has_input("bias_correction1", 0)) { + bias_correction1_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0)->dptr(); + } + const float* bias_correction2_ptr = nullptr; + if (ctx->has_input("bias_correction2", 0)) { + bias_correction2_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0)->dptr(); + } + // update kernel + AdamUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, beta1, beta2, + epsilon, bias_correction1_ptr, bias_correction2_ptr, + reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, + down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), + updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("adam_embedding_update") \ + .SetCreateFn< \ + AdamEmbeddingUpdateKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, + // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + AdagradEmbeddingUpdateKernel() = default; + ~AdagradEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + 
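// Host-side sketch of the learning-rate decay applied inside AdagradUpdateKernel above:
// train_step is the device counter plus one, and the rate decays as
// lr / (1 + (train_step - 1) * lr_decay). The helper name is illustrative only.
#include <cstdio>

float DecayedAdagradLr(float base_lr, long train_step_from_device, float lr_decay) {
  const long train_step = train_step_from_device + 1;
  return base_lr / (1.0f + (train_step - 1) * lr_decay);
}

int main() {
  // base_lr = 0.1, lr_decay = 0.5: the first step keeps 0.1, the third step gives 0.05.
  std::printf("%f %f\n", DecayedAdagradLr(0.1f, 0, 0.5f), DecayedAdagradLr(0.1f, 2, 0.5f));
  return 0;
}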
CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 2); + + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const auto lr_decay = ctx->Attr("lr_decay"); + const auto epsilon = ctx->Attr("epsilon"); + const auto scale = ctx->Attr("scale"); + const T* scale_by_ptr = nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* train_step_ptr = ctx->Tensor4ArgNameAndIndex("train_step", 0)->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + AdagradUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_decay, + epsilon, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, + train_step_ptr, scale_by_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), + unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("adagrad_embedding_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL, + // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, + FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, + IDX_DATA_TYPE_SEQ) + +template +class FtrlEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + FtrlEmbeddingUpdateKernel() = default; + ~FtrlEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const 
user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2) + << "The NumAxes of unique_embedding should be equal to 2. "; + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2) + << "The NumAxes of embedding_grad should be equal to 2. "; + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 3) + << "The line_size should be equal to 3 x embedding_size. "; + const float l1 = 0.0; + const float l2 = 0.0; + const float weight_decay = ctx->Attr("weight_decay"); + // TODO(zhengzekang): Undefined behavior for ftrl optimizer with weight_decay in `abs(new_z_val) + // < lambda1` condition. + CHECK_EQ(weight_decay, static_cast(0.0)) + << "Currently not support for setting weight decay. "; + const float lr_power = ctx->Attr("lr_power"); + const float lambda1 = ctx->Attr("lambda1"); + const float lambda2 = ctx->Attr("lambda2"); + const float beta = ctx->Attr("beta"); + const double scale = ctx->Attr("scale"); + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + FtrlUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_power, + lambda1, lambda2, beta, reinterpret_cast(num_unique_ids->dptr()), + learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), + unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; +#define REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("ftrl_embedding_update") \ + .SetCreateFn< \ + FtrlEmbeddingUpdateKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, + // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/one_hot_kernel.hip.cpp b/oneflow/user/kernels/one_hot_kernel.hip.cpp index 661a41b..4b0a5b1 100644 --- 
a/oneflow/user/kernels/one_hot_kernel.hip.cpp +++ b/oneflow/user/kernels/one_hot_kernel.hip.cpp @@ -1,81 +1,81 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void OneHotEncodeGpu(int64_t elem_cnt, const int64_t depth, const T on_value, - const T off_value, const K* indices, T* out) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int64_t row = i / depth; - const int64_t col = i - row * depth; - const int64_t idx = indices[row]; - assert(idx >= 0 && idx < depth); - out[i] = (idx == col) ? on_value : off_value; - } -} - -} // namespace - -template -class GpuOneHotKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - GpuOneHotKernel() = default; - ~GpuOneHotKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_indices = indices->shape_view().elem_cnt(); - const int64_t depth = ctx->Attr("depth"); - const DataType dtype = ctx->Attr("dtype"); - const T on_value = IsFloatingDataType(dtype) - ? static_cast(ctx->Attr("floating_on_value")) - : static_cast(ctx->Attr("integer_on_value")); - const T off_value = IsFloatingDataType(dtype) - ? static_cast(ctx->Attr("floating_off_value")) - : static_cast(ctx->Attr("integer_off_value")); - RUN_CUDA_KERNEL((OneHotEncodeGpu), ctx->stream(), num_indices * depth, - num_indices * depth, depth, on_value, off_value, indices->dptr(), - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ONE_HOT_KERNEL(dtype, itype) \ - REGISTER_USER_KERNEL("one_hot").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("indices", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int64_t) -REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int64_t) -REGISTER_CUDA_ONE_HOT_KERNEL(float, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(float, int64_t) -REGISTER_CUDA_ONE_HOT_KERNEL(double, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(double, int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void OneHotEncodeGpu(int64_t elem_cnt, const int64_t depth, const T on_value, + const T off_value, const K* indices, T* out) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int64_t row = i / depth; + const int64_t col = i - row * depth; + const int64_t idx = indices[row]; + assert(idx >= 0 && idx < depth); + out[i] = (idx == col) ? on_value : off_value; + } +} + +} // namespace + +template +class GpuOneHotKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + GpuOneHotKernel() = default; + ~GpuOneHotKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t num_indices = indices->shape_view().elem_cnt(); + const int64_t depth = ctx->Attr("depth"); + const DataType dtype = ctx->Attr("dtype"); + const T on_value = IsFloatingDataType(dtype) + ? static_cast(ctx->Attr("floating_on_value")) + : static_cast(ctx->Attr("integer_on_value")); + const T off_value = IsFloatingDataType(dtype) + ? static_cast(ctx->Attr("floating_off_value")) + : static_cast(ctx->Attr("integer_off_value")); + RUN_CUDA_KERNEL((OneHotEncodeGpu), ctx->stream(), num_indices * depth, + num_indices * depth, depth, on_value, off_value, indices->dptr(), + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ONE_HOT_KERNEL(dtype, itype) \ + REGISTER_USER_KERNEL("one_hot").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("indices", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int64_t) +REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int64_t) +REGISTER_CUDA_ONE_HOT_KERNEL(float, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(float, int64_t) +REGISTER_CUDA_ONE_HOT_KERNEL(double, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(double, int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/pad2d_kernels_util.hip.cpp b/oneflow/user/kernels/pad2d_kernels_util.hip.cpp index 9c9bf9c..37f40e6 100644 --- a/oneflow/user/kernels/pad2d_kernels_util.hip.cpp +++ b/oneflow/user/kernels/pad2d_kernels_util.hip.cpp @@ -1,214 +1,214 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/pad2d_kernels_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -template -__global__ void DoCUDAReflectionPad2d(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - DoReflectionPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, - x_height, x_width, pad_left, pad_top); -}; - -template -__global__ void DoCUDAReflectionPad2dGrad(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - DoReflectionPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, - dy_width, dx_height, dx_width, pad_left, pad_top); -}; - -template -struct ReflectionPad2dFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, - pad_left, pad_top); - } -}; - -// float16 implementation -template<> -void ReflectionPad2dFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); -} - -template -struct ReflectionPad2dGradFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, - dx_width, pad_left, pad_top); - } -}; - -// float16 implementation 
-template<> -void ReflectionPad2dGradFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); -} - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_GRAD_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -template -__global__ void DoCUDAReplicationPad2d(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - DoReplicationPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, - x_height, x_width, pad_left, pad_top); -}; - -template -__global__ void DoCUDAReplicationPad2dGrad(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - DoReplicationPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, - dy_width, dx_height, dx_width, pad_left, pad_top); -}; - -template -struct ReplicationPad2dFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, - pad_left, pad_top); - } -}; - -// float16 implementation -template<> -void ReplicationPad2dFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); -} - -template -struct ReplicationPad2dGradFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = 
n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, - dx_width, pad_left, pad_top); - } -}; - -// float16 implementation -template<> -void ReplicationPad2dGradFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); -} - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_GRAD_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/pad2d_kernels_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +template +__global__ void DoCUDAReflectionPad2d(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + DoReflectionPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, + x_height, x_width, pad_left, pad_top); +}; + +template +__global__ void DoCUDAReflectionPad2dGrad(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + DoReflectionPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, + dy_width, dx_height, dx_width, pad_left, pad_top); +}; + +template +struct ReflectionPad2dFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, + pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReflectionPad2dFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); +} + +template +struct ReflectionPad2dGradFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height * dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, + dx_width, pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReflectionPad2dGradFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height 
* dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); +} + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_GRAD_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +template +__global__ void DoCUDAReplicationPad2d(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + DoReplicationPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, + x_height, x_width, pad_left, pad_top); +}; + +template +__global__ void DoCUDAReplicationPad2dGrad(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + DoReplicationPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, + dy_width, dx_height, dx_width, pad_left, pad_top); +}; + +template +struct ReplicationPad2dFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, + pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReplicationPad2dFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); +} + +template +struct ReplicationPad2dGradFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height * dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, + dx_width, pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReplicationPad2dGradFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const 
NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height * dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); +} + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_GRAD_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +} // namespace user_op +} // namespace oneflow + #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp b/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp index 22c1798..402ea80 100644 --- a/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp +++ b/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp @@ -1,431 +1,431 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/balanced_splitter.h" -#include "oneflow/user/kernels/gather_kernel_util.h" -#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" -#include -#include -#include -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -template -int64_t GetCubSortPairsTempStorageSize(int64_t n) { - size_t cub_sort_temp_store_size = 0; - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, - nullptr, nullptr, nullptr, n))); - size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); - CHECK_GE(temp_store_size, 0); - CHECK_LT(temp_store_size, static_cast(GetMaxVal())); - return static_cast(temp_store_size); -} - -template -int64_t GetCubScanTempStorageSize(int64_t n) { - size_t cub_scan_temp_store_size = 0; - NotEqualToPreviousAdjacentIterator unique_counting_iter(nullptr, 0); - OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( - nullptr, cub_scan_temp_store_size, unique_counting_iter, nullptr, n))); - size_t temp_store_size = GetCudaAlignedSize(cub_scan_temp_store_size); - CHECK_GE(temp_store_size, 0); - CHECK_LT(temp_store_size, static_cast(GetMaxVal())); - return static_cast(temp_store_size); -} - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(void* ptr, const int64_t device_num_class, const int64_t batch_size, - const int64_t parallel_num) - : ptr_(ptr) { - const int64_t buffer_elem_cnt = std::max(device_num_class, batch_size); - const size_t cub_sort_keys_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t cub_sort_values_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t cub_sort_keys_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t cub_sort_values_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t bound_index_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); - const size_t bound_value_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); - cub_tmp_storage_bytes_ = std::max(GetCubSortPairsTempStorageSize(buffer_elem_cnt), - GetCubScanTempStorageSize(batch_size)); - cub_sort_keys_offset_ = 0; - cub_sort_values_offset_ = cub_sort_keys_offset_ + cub_sort_keys_bytes; - cub_sort_keys_out_offset_ = cub_sort_values_offset_ + cub_sort_values_bytes; - cub_sort_values_out_offset_ = cub_sort_keys_out_offset_ + cub_sort_keys_out_bytes; - cub_tmp_storage_offset_ = cub_sort_values_out_offset_ + cub_sort_values_out_bytes; - bound_index_offset_ = cub_tmp_storage_offset_ + cub_tmp_storage_bytes_; - bound_value_offset_ = bound_index_offset_ + bound_index_bytes; - total_buffer_size_ = cub_sort_keys_bytes + cub_sort_values_bytes + cub_sort_keys_out_bytes - + cub_sort_values_out_bytes + cub_tmp_storage_bytes_ + bound_index_bytes - + bound_value_bytes; - } - ~TmpBufferManager() = default; - - size_t GetTotalBufferSize() const { return total_buffer_size_; } - size_t GetCubTmpStorageSize() const { return cub_tmp_storage_bytes_; } - K* CubSortKeysPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_offset_); - } - K* CubSortValuesPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_offset_); - } - K* CubSortKeysOutPtr() const { - CHECK(ptr_ != 
nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_out_offset_); - } - K* CubSortValuesOutPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_out_offset_); - } - void* CubTmpStoragePtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_tmp_storage_offset_); - } - K* BoundIndexPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + bound_index_offset_); - } - K* BoundValuePtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + bound_value_offset_); - } - - private: - size_t cub_sort_keys_offset_; - size_t cub_sort_values_offset_; - size_t cub_sort_keys_out_offset_; - size_t cub_sort_values_out_offset_; - size_t cub_tmp_storage_offset_; - size_t bound_index_offset_; - size_t bound_value_offset_; - size_t cub_tmp_storage_bytes_; - size_t total_buffer_size_; - void* ptr_; -}; - -__global__ void SetupKernel(int64_t seed, hiprandState* state) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) - + (static_cast(id) >> 2U)); - hiprand_init(local_seed, 0, 0, &state[id]); -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t n, const int64_t max_val, K* buffer) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } - state[id] = localState; -} - -class DistributedPartialFcSampleOpKernelState final : public user_op::OpKernelState { - public: - DistributedPartialFcSampleOpKernelState(ep::Stream* stream, int64_t lower, int64_t upper, - int64_t num_sample_per_rank, int64_t seed) - : lower_(lower), upper_(upper), num_sample_per_rank_(num_sample_per_rank) { - CHECK_NOTNULL(stream); - const int64_t num_classes = upper_ - lower_; - OF_CUDA_CHECK(hipMalloc(&curand_states_, BlocksNum4ThreadsNum(num_classes) - * kCudaThreadsNumPerBlock * sizeof(hiprandState))); - SetupKernel<<As()->cuda_stream()>>>(seed, curand_states_); - } - ~DistributedPartialFcSampleOpKernelState() { - hipError_t ret = hipFree(curand_states_); - if (ret != hipErrorDeinitialized) { OF_CUDA_CHECK(ret); } - }; - - int64_t lower() const { return lower_; } - int64_t upper() const { return upper_; } - int64_t num_sample_per_rank() const { return num_sample_per_rank_; } - - template - void GenRandom(ep::Stream* stream, const int64_t n, const int64_t max_val, K* buffer) { - GenerateGpu - <<As()->cuda_stream()>>>(curand_states_, n, max_val, buffer); - } - - private: - const int64_t lower_; - const int64_t upper_; - const int64_t num_sample_per_rank_; - hiprandState* curand_states_; -}; - -template -__global__ void IotaKernel(int64_t n, K* out) { - CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(i); } -} - -template -__global__ void MarkPositive(const int64_t n, const int64_t offset, const int64_t num_classes, - const K* labels, K* out) { - CUDA_1D_KERNEL_LOOP(i, n) { - K label = labels[i] - offset; - if (label >= 0 && label < num_classes) { out[label] = label - num_classes; } - } -} - -template -__global__ void GetSampledLabel(const int64_t n, const int64_t offset, const K* label, - K* sampled_label) { - CUDA_1D_KERNEL_LOOP(i, n) { sampled_label[i] = label[i] + offset; } -} - -template -__global__ void GetLabelMap(const int64_t n, const int64_t parallel_num, - const int64_t num_sample_per_rank, const K* bound_index, - const 
K* bound_value, K* label_map) { - CUDA_1D_KERNEL_LOOP(i, n) { -#pragma unroll - for (int64_t j = 0; j < parallel_num; j++) { - if (i >= bound_index[j] && i < bound_index[j + 1]) { - label_map[i] = label_map[i] - bound_value[j] + j * num_sample_per_rank; - } - } - } -} - -template -__global__ void GetPartionBound(const int64_t n, const int64_t parallel_num, - const int64_t num_classes_per_rank, const K* key_ptr, - const K* value_ptr, K* bound_index, K* bound_value) { - CUDA_1D_KERNEL_LOOP(i, n) { - if (i != 0) { - const K cur_in = key_ptr[i] / num_classes_per_rank; - const K pre_in = key_ptr[i - 1] / num_classes_per_rank; - if (cur_in > pre_in) { - assert(cur_in < parallel_num); -#pragma unroll - for (int32_t j = pre_in + 1; j <= cur_in; ++j) { - bound_index[j] = static_cast(i); - bound_value[j] = value_ptr[i]; - } - } - } - } - CUDA_1D_KERNEL_LOOP(i, parallel_num + 1) { - const K first_in = key_ptr[0] / num_classes_per_rank; - const K last_in = key_ptr[n - 1] / num_classes_per_rank; - if (i <= first_in) { - bound_index[i] = 0; - bound_value[i] = value_ptr[0]; - } else if (i > last_in) { - bound_index[i] = n; - bound_value[i] = value_ptr[n - 1]; - } - } -} - -template -__global__ void GetMappedLabel(const int64_t n, const K* label_map_key, const K* label_map_value, - K* mapped_label) { - CUDA_1D_KERNEL_LOOP(i, n) { mapped_label[label_map_key[i]] = label_map_value[i]; } -} - -template -void MapLabel(ep::Stream* stream, const int64_t num_classes, const int64_t batch_size, - const int64_t lower_bound, const int64_t parallel_num, const int64_t num_sample, - size_t temp_storage_bytes, const K* label_ptr, K* mapped_label_ptr, - K* cub_sort_values_ptr, K* cub_sort_keys_out_ptr, K* cub_sort_values_out_ptr, - void* cub_tmp_storage_ptr, K* bound_index_ptr, K* bound_value_ptr) { - IotaKernel<<As()->cuda_stream()>>>(batch_size, cub_sort_values_ptr); - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( - cub_tmp_storage_ptr, temp_storage_bytes, label_ptr, cub_sort_keys_out_ptr, - cub_sort_values_ptr, cub_sort_values_out_ptr, batch_size, 0, sizeof(K) * 8, - stream->As()->cuda_stream()))); - NotEqualToPreviousAdjacentIterator unique_counting_iter(cub_sort_keys_out_ptr, 0); - OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( - cub_tmp_storage_ptr, temp_storage_bytes, unique_counting_iter, cub_sort_values_ptr, - batch_size, stream->As()->cuda_stream()))); - - GetPartionBound<<As()->cuda_stream()>>>( - batch_size, parallel_num, num_classes, cub_sort_keys_out_ptr, cub_sort_values_ptr, - bound_index_ptr, bound_value_ptr); - - GetLabelMap<<As()->cuda_stream()>>>( - batch_size, parallel_num, num_sample, bound_index_ptr, bound_value_ptr, cub_sort_values_ptr); - - GetMappedLabel<<As()->cuda_stream()>>>( - batch_size, cub_sort_values_out_ptr, cub_sort_values_ptr, mapped_label_ptr); -} - -} // namespace - -template -class DistributedPartialFcSampleGpuKernel final : public user_op::OpKernel { - public: - DistributedPartialFcSampleGpuKernel() = default; - ~DistributedPartialFcSampleGpuKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("weight", 0); - const TensorDesc* in_logical_desc = ctx->LogicalTensorDesc4ArgNameAndIndex("weight", 0); - const int64_t class_num = in_logical_desc->shape().At(0); - const int64_t num_sample = ctx->Attr("num_sample"); - int64_t seed = ctx->Attr("seed"); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t 
num_sample_per_rank = RoundUp(num_sample, parallel_num) / parallel_num; - if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 0 && parallel_num > 1) { - std::seed_seq seq{seed}; - std::vector seeds(parallel_num); - seq.generate(seeds.begin(), seeds.end()); - seed = seeds.at(ctx->parallel_ctx().parallel_id()); - CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); - BalancedSplitter bs(class_num, parallel_num); - return std::make_shared( - ctx->stream(), bs.At(ctx->parallel_ctx().parallel_id()).begin(), - bs.At(ctx->parallel_ctx().parallel_id()).end(), num_sample_per_rank, seed); - } else { - return std::make_shared(ctx->stream(), 0, class_num, - num_sample_per_rank, seed); - } - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - user_op::Tensor* mapped_label = ctx->Tensor4ArgNameAndIndex("mapped_label", 0); - user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); - user_op::Tensor* sampled_weight = ctx->Tensor4ArgNameAndIndex("sampled_weight", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int64_t batch_size = label->shape_view().At(0); - const int64_t num_classes = weight->shape_view().At(0); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_classes, batch_size, - parallel_num); - - auto* kernel_state = dynamic_cast(state); - CHECK_NOTNULL(kernel_state); - CHECK_EQ(num_classes, kernel_state->upper() - kernel_state->lower()); - const int64_t lower_bound = kernel_state->lower(); - const int64_t num_sample = kernel_state->num_sample_per_rank(); - kernel_state->GenRandom(ctx->stream(), num_classes, num_classes, - buffer_manager.CubSortKeysPtr()); - MarkPositive<<stream()->As()->cuda_stream()>>>( - batch_size, lower_bound, num_classes, label->dptr(), buffer_manager.CubSortKeysPtr()); - IotaKernel<<stream()->As()->cuda_stream()>>>( - num_classes, buffer_manager.CubSortValuesPtr()); - size_t temp_storage_bytes = buffer_manager.GetCubTmpStorageSize(); - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( - buffer_manager.CubTmpStoragePtr(), temp_storage_bytes, buffer_manager.CubSortKeysPtr(), - buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesPtr(), - buffer_manager.CubSortValuesOutPtr(), num_classes, 0, sizeof(K) * 8, - ctx->stream()->As()->cuda_stream()))); - - GetSampledLabel<<stream()->As()->cuda_stream()>>>( - num_sample, lower_bound, buffer_manager.CubSortValuesOutPtr(), - sampled_label->mut_dptr()); - - GatherKernelUtilImpl::Forward( - ctx->stream(), buffer_manager.CubSortValuesOutPtr(), num_sample, weight->dptr(), - Shape({1, num_classes, weight->shape_view().Count(1)}), sampled_weight->mut_dptr(), 0); - - MapLabel(ctx->stream(), num_classes, batch_size, lower_bound, parallel_num, num_sample, - buffer_manager.GetCubTmpStorageSize(), label->dptr(), - mapped_label->mut_dptr(), buffer_manager.CubSortValuesPtr(), - buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesOutPtr(), - buffer_manager.CubTmpStoragePtr(), buffer_manager.BoundIndexPtr(), - buffer_manager.BoundValuePtr()); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define 
REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("distributed_partial_fc_sample") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ - .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ - const int64_t num_classes = ctx->InputTensorDesc("weight", 0).shape().At(0); \ - const int64_t batch_size = ctx->InputTensorDesc("label", 0).shape().At(0); \ - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); \ - TmpBufferManager buffer_manager(nullptr, num_classes, \ - batch_size, parallel_num); \ - return buffer_manager.GetTotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -template -class DistributedPartialFcSampleDisableBoxingGpuKernel final : public user_op::OpKernel { - public: - DistributedPartialFcSampleDisableBoxingGpuKernel() = default; - ~DistributedPartialFcSampleDisableBoxingGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* sampled_weight_diff = - ctx->Tensor4ArgNameAndIndex("sampled_weight_diff", 0); - const user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); - user_op::Tensor* boxing_disabled_sampled_weight_diff = - ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_weight_diff", 0); - user_op::Tensor* boxing_disabled_sampled_label = - ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_label", 0); - Memcpy(ctx->stream(), boxing_disabled_sampled_weight_diff->mut_dptr(), - sampled_weight_diff->dptr(), - sampled_weight_diff->shape_view().elem_cnt() - * GetSizeOfDataType(sampled_weight_diff->data_type())); - Memcpy( - ctx->stream(), boxing_disabled_sampled_label->mut_dptr(), sampled_label->dptr(), - sampled_label->shape_view().elem_cnt() * GetSizeOfDataType(sampled_label->data_type())); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("distributed_partial_fc_sample_disable_boxing") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("sampled_label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("sampled_weight_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair))); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/balanced_splitter.h" +#include "oneflow/user/kernels/gather_kernel_util.h" +#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" +#include +#include +#include +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +template +int64_t GetCubSortPairsTempStorageSize(int64_t n) { + size_t cub_sort_temp_store_size = 0; + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, + nullptr, nullptr, nullptr, n))); + size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); + CHECK_GE(temp_store_size, 0); + CHECK_LT(temp_store_size, static_cast(GetMaxVal())); + return static_cast(temp_store_size); +} + +template +int64_t GetCubScanTempStorageSize(int64_t n) { + size_t cub_scan_temp_store_size = 0; + NotEqualToPreviousAdjacentIterator unique_counting_iter(nullptr, 0); + OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( + nullptr, cub_scan_temp_store_size, unique_counting_iter, nullptr, n))); + size_t temp_store_size = GetCudaAlignedSize(cub_scan_temp_store_size); + CHECK_GE(temp_store_size, 0); + CHECK_LT(temp_store_size, static_cast(GetMaxVal())); + return static_cast(temp_store_size); +} + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(void* ptr, const int64_t device_num_class, const int64_t batch_size, + const int64_t parallel_num) + : ptr_(ptr) { + const int64_t buffer_elem_cnt = std::max(device_num_class, batch_size); + const size_t cub_sort_keys_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t cub_sort_values_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t cub_sort_keys_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t cub_sort_values_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t bound_index_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); + const size_t bound_value_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); + cub_tmp_storage_bytes_ = std::max(GetCubSortPairsTempStorageSize(buffer_elem_cnt), + GetCubScanTempStorageSize(batch_size)); + cub_sort_keys_offset_ = 0; + cub_sort_values_offset_ = cub_sort_keys_offset_ + cub_sort_keys_bytes; + cub_sort_keys_out_offset_ = cub_sort_values_offset_ + cub_sort_values_bytes; + cub_sort_values_out_offset_ = cub_sort_keys_out_offset_ + cub_sort_keys_out_bytes; + cub_tmp_storage_offset_ = cub_sort_values_out_offset_ + cub_sort_values_out_bytes; + bound_index_offset_ = cub_tmp_storage_offset_ + cub_tmp_storage_bytes_; + bound_value_offset_ = bound_index_offset_ + bound_index_bytes; + total_buffer_size_ = cub_sort_keys_bytes + cub_sort_values_bytes + cub_sort_keys_out_bytes + + cub_sort_values_out_bytes + cub_tmp_storage_bytes_ + bound_index_bytes + + bound_value_bytes; + } + ~TmpBufferManager() = default; + + size_t GetTotalBufferSize() const { return total_buffer_size_; } + size_t GetCubTmpStorageSize() const { return cub_tmp_storage_bytes_; } + K* CubSortKeysPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_offset_); + } + K* CubSortValuesPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_offset_); + } + K* CubSortKeysOutPtr() const { + CHECK(ptr_ != 
nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_out_offset_); + } + K* CubSortValuesOutPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_out_offset_); + } + void* CubTmpStoragePtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_tmp_storage_offset_); + } + K* BoundIndexPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + bound_index_offset_); + } + K* BoundValuePtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + bound_value_offset_); + } + + private: + size_t cub_sort_keys_offset_; + size_t cub_sort_values_offset_; + size_t cub_sort_keys_out_offset_; + size_t cub_sort_values_out_offset_; + size_t cub_tmp_storage_offset_; + size_t bound_index_offset_; + size_t bound_value_offset_; + size_t cub_tmp_storage_bytes_; + size_t total_buffer_size_; + void* ptr_; +}; + +__global__ void SetupKernel(int64_t seed, hiprandState* state) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) + + (static_cast(id) >> 2U)); + hiprand_init(local_seed, 0, 0, &state[id]); +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t n, const int64_t max_val, K* buffer) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } + state[id] = localState; +} + +class DistributedPartialFcSampleOpKernelState final : public user_op::OpKernelState { + public: + DistributedPartialFcSampleOpKernelState(ep::Stream* stream, int64_t lower, int64_t upper, + int64_t num_sample_per_rank, int64_t seed) + : lower_(lower), upper_(upper), num_sample_per_rank_(num_sample_per_rank) { + CHECK_NOTNULL(stream); + const int64_t num_classes = upper_ - lower_; + OF_CUDA_CHECK(hipMalloc(&curand_states_, BlocksNum4ThreadsNum(num_classes) + * kCudaThreadsNumPerBlock * sizeof(hiprandState))); + SetupKernel<<As()->cuda_stream()>>>(seed, curand_states_); + } + ~DistributedPartialFcSampleOpKernelState() { + hipError_t ret = hipFree(curand_states_); + if (ret != hipErrorDeinitialized) { OF_CUDA_CHECK(ret); } + }; + + int64_t lower() const { return lower_; } + int64_t upper() const { return upper_; } + int64_t num_sample_per_rank() const { return num_sample_per_rank_; } + + template + void GenRandom(ep::Stream* stream, const int64_t n, const int64_t max_val, K* buffer) { + GenerateGpu + <<As()->cuda_stream()>>>(curand_states_, n, max_val, buffer); + } + + private: + const int64_t lower_; + const int64_t upper_; + const int64_t num_sample_per_rank_; + hiprandState* curand_states_; +}; + +template +__global__ void IotaKernel(int64_t n, K* out) { + CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(i); } +} + +template +__global__ void MarkPositive(const int64_t n, const int64_t offset, const int64_t num_classes, + const K* labels, K* out) { + CUDA_1D_KERNEL_LOOP(i, n) { + K label = labels[i] - offset; + if (label >= 0 && label < num_classes) { out[label] = label - num_classes; } + } +} + +template +__global__ void GetSampledLabel(const int64_t n, const int64_t offset, const K* label, + K* sampled_label) { + CUDA_1D_KERNEL_LOOP(i, n) { sampled_label[i] = label[i] + offset; } +} + +template +__global__ void GetLabelMap(const int64_t n, const int64_t parallel_num, + const int64_t num_sample_per_rank, const K* bound_index, + const 
K* bound_value, K* label_map) { + CUDA_1D_KERNEL_LOOP(i, n) { +#pragma unroll + for (int64_t j = 0; j < parallel_num; j++) { + if (i >= bound_index[j] && i < bound_index[j + 1]) { + label_map[i] = label_map[i] - bound_value[j] + j * num_sample_per_rank; + } + } + } +} + +template +__global__ void GetPartionBound(const int64_t n, const int64_t parallel_num, + const int64_t num_classes_per_rank, const K* key_ptr, + const K* value_ptr, K* bound_index, K* bound_value) { + CUDA_1D_KERNEL_LOOP(i, n) { + if (i != 0) { + const K cur_in = key_ptr[i] / num_classes_per_rank; + const K pre_in = key_ptr[i - 1] / num_classes_per_rank; + if (cur_in > pre_in) { + assert(cur_in < parallel_num); +#pragma unroll + for (int32_t j = pre_in + 1; j <= cur_in; ++j) { + bound_index[j] = static_cast(i); + bound_value[j] = value_ptr[i]; + } + } + } + } + CUDA_1D_KERNEL_LOOP(i, parallel_num + 1) { + const K first_in = key_ptr[0] / num_classes_per_rank; + const K last_in = key_ptr[n - 1] / num_classes_per_rank; + if (i <= first_in) { + bound_index[i] = 0; + bound_value[i] = value_ptr[0]; + } else if (i > last_in) { + bound_index[i] = n; + bound_value[i] = value_ptr[n - 1]; + } + } +} + +template +__global__ void GetMappedLabel(const int64_t n, const K* label_map_key, const K* label_map_value, + K* mapped_label) { + CUDA_1D_KERNEL_LOOP(i, n) { mapped_label[label_map_key[i]] = label_map_value[i]; } +} + +template +void MapLabel(ep::Stream* stream, const int64_t num_classes, const int64_t batch_size, + const int64_t lower_bound, const int64_t parallel_num, const int64_t num_sample, + size_t temp_storage_bytes, const K* label_ptr, K* mapped_label_ptr, + K* cub_sort_values_ptr, K* cub_sort_keys_out_ptr, K* cub_sort_values_out_ptr, + void* cub_tmp_storage_ptr, K* bound_index_ptr, K* bound_value_ptr) { + IotaKernel<<As()->cuda_stream()>>>(batch_size, cub_sort_values_ptr); + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( + cub_tmp_storage_ptr, temp_storage_bytes, label_ptr, cub_sort_keys_out_ptr, + cub_sort_values_ptr, cub_sort_values_out_ptr, batch_size, 0, sizeof(K) * 8, + stream->As()->cuda_stream()))); + NotEqualToPreviousAdjacentIterator unique_counting_iter(cub_sort_keys_out_ptr, 0); + OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( + cub_tmp_storage_ptr, temp_storage_bytes, unique_counting_iter, cub_sort_values_ptr, + batch_size, stream->As()->cuda_stream()))); + + GetPartionBound<<As()->cuda_stream()>>>( + batch_size, parallel_num, num_classes, cub_sort_keys_out_ptr, cub_sort_values_ptr, + bound_index_ptr, bound_value_ptr); + + GetLabelMap<<As()->cuda_stream()>>>( + batch_size, parallel_num, num_sample, bound_index_ptr, bound_value_ptr, cub_sort_values_ptr); + + GetMappedLabel<<As()->cuda_stream()>>>( + batch_size, cub_sort_values_out_ptr, cub_sort_values_ptr, mapped_label_ptr); +} + +} // namespace + +template +class DistributedPartialFcSampleGpuKernel final : public user_op::OpKernel { + public: + DistributedPartialFcSampleGpuKernel() = default; + ~DistributedPartialFcSampleGpuKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("weight", 0); + const TensorDesc* in_logical_desc = ctx->LogicalTensorDesc4ArgNameAndIndex("weight", 0); + const int64_t class_num = in_logical_desc->shape().At(0); + const int64_t num_sample = ctx->Attr("num_sample"); + int64_t seed = ctx->Attr("seed"); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t 
num_sample_per_rank = RoundUp(num_sample, parallel_num) / parallel_num; + if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 0 && parallel_num > 1) { + std::seed_seq seq{seed}; + std::vector seeds(parallel_num); + seq.generate(seeds.begin(), seeds.end()); + seed = seeds.at(ctx->parallel_ctx().parallel_id()); + CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); + BalancedSplitter bs(class_num, parallel_num); + return std::make_shared( + ctx->stream(), bs.At(ctx->parallel_ctx().parallel_id()).begin(), + bs.At(ctx->parallel_ctx().parallel_id()).end(), num_sample_per_rank, seed); + } else { + return std::make_shared(ctx->stream(), 0, class_num, + num_sample_per_rank, seed); + } + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + user_op::Tensor* mapped_label = ctx->Tensor4ArgNameAndIndex("mapped_label", 0); + user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); + user_op::Tensor* sampled_weight = ctx->Tensor4ArgNameAndIndex("sampled_weight", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int64_t batch_size = label->shape_view().At(0); + const int64_t num_classes = weight->shape_view().At(0); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_classes, batch_size, + parallel_num); + + auto* kernel_state = dynamic_cast(state); + CHECK_NOTNULL(kernel_state); + CHECK_EQ(num_classes, kernel_state->upper() - kernel_state->lower()); + const int64_t lower_bound = kernel_state->lower(); + const int64_t num_sample = kernel_state->num_sample_per_rank(); + kernel_state->GenRandom(ctx->stream(), num_classes, num_classes, + buffer_manager.CubSortKeysPtr()); + MarkPositive<<stream()->As()->cuda_stream()>>>( + batch_size, lower_bound, num_classes, label->dptr(), buffer_manager.CubSortKeysPtr()); + IotaKernel<<stream()->As()->cuda_stream()>>>( + num_classes, buffer_manager.CubSortValuesPtr()); + size_t temp_storage_bytes = buffer_manager.GetCubTmpStorageSize(); + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( + buffer_manager.CubTmpStoragePtr(), temp_storage_bytes, buffer_manager.CubSortKeysPtr(), + buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesPtr(), + buffer_manager.CubSortValuesOutPtr(), num_classes, 0, sizeof(K) * 8, + ctx->stream()->As()->cuda_stream()))); + + GetSampledLabel<<stream()->As()->cuda_stream()>>>( + num_sample, lower_bound, buffer_manager.CubSortValuesOutPtr(), + sampled_label->mut_dptr()); + + GatherKernelUtilImpl::Forward( + ctx->stream(), buffer_manager.CubSortValuesOutPtr(), num_sample, weight->dptr(), + Shape({1, num_classes, weight->shape_view().Count(1)}), sampled_weight->mut_dptr(), 0); + + MapLabel(ctx->stream(), num_classes, batch_size, lower_bound, parallel_num, num_sample, + buffer_manager.GetCubTmpStorageSize(), label->dptr(), + mapped_label->mut_dptr(), buffer_manager.CubSortValuesPtr(), + buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesOutPtr(), + buffer_manager.CubTmpStoragePtr(), buffer_manager.BoundIndexPtr(), + buffer_manager.BoundValuePtr()); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define 
REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL(dtype_pair, ltype_pair) \ + REGISTER_USER_KERNEL("distributed_partial_fc_sample") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ + && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ + .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ + const int64_t num_classes = ctx->InputTensorDesc("weight", 0).shape().At(0); \ + const int64_t batch_size = ctx->InputTensorDesc("label", 0).shape().At(0); \ + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); \ + TmpBufferManager buffer_manager(nullptr, num_classes, \ + batch_size, parallel_num); \ + return buffer_manager.GetTotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +template +class DistributedPartialFcSampleDisableBoxingGpuKernel final : public user_op::OpKernel { + public: + DistributedPartialFcSampleDisableBoxingGpuKernel() = default; + ~DistributedPartialFcSampleDisableBoxingGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* sampled_weight_diff = + ctx->Tensor4ArgNameAndIndex("sampled_weight_diff", 0); + const user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); + user_op::Tensor* boxing_disabled_sampled_weight_diff = + ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_weight_diff", 0); + user_op::Tensor* boxing_disabled_sampled_label = + ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_label", 0); + Memcpy(ctx->stream(), boxing_disabled_sampled_weight_diff->mut_dptr(), + sampled_weight_diff->dptr(), + sampled_weight_diff->shape_view().elem_cnt() + * GetSizeOfDataType(sampled_weight_diff->data_type())); + Memcpy( + ctx->stream(), boxing_disabled_sampled_label->mut_dptr(), sampled_label->dptr(), + sampled_label->shape_view().elem_cnt() * GetSizeOfDataType(sampled_label->data_type())); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL(dtype_pair, ltype_pair) \ + REGISTER_USER_KERNEL("distributed_partial_fc_sample_disable_boxing") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("sampled_label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ + && (user_op::HobDataType("sampled_weight_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair))); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +} // namespace user_op +} // namespace oneflow #endif \ No newline at end of file diff --git a/oneflow/user/kernels/prelu_kernel.hip.cpp b/oneflow/user/kernels/prelu_kernel.hip.cpp index 1613ff4..870272b 100644 --- a/oneflow/user/kernels/prelu_kernel.hip.cpp +++ b/oneflow/user/kernels/prelu_kernel.hip.cpp @@ -1,505 +1,505 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -Shape CreatePreluLeftExtendedShape(const ShapeView& shape) { - DimVector dim_vec(shape.NumAxes()); - dim_vec.at(0) = 1LL; - dim_vec.at(1) = shape.At(1); - for (int i = 2; i < shape.NumAxes(); i++) { dim_vec.at(i) = 1LL; } - return Shape(std::move(dim_vec)); -} - -template -struct PreluForwardSingleAlphaFunctor { - OF_DEVICE_FUNC explicit PreluForwardSingleAlphaFunctor(const T alpha) : alpha(alpha) {} - __device__ T operator()(T x) const { return (x > static_cast(0.0)) ? x : (alpha * x); } - const T alpha; -}; - -template -struct PreluForwardSingleAlphaPtrFunctor { - OF_DEVICE_FUNC explicit PreluForwardSingleAlphaPtrFunctor(const T* alpha_ptr) - : alpha_ptr(alpha_ptr) {} - __device__ PreluForwardSingleAlphaFunctor operator()() const { - return PreluForwardSingleAlphaFunctor(*alpha_ptr); - } - const T* alpha_ptr; -}; - -template -__global__ void PReluBackwardSingleAlphaGpu(const IndexType elem_cnt, const int64_t n_tail, - const T* x, const T* alpha, const T* dy, T* dx, - T* alpha_diff, const T* tail_x, const T* tail_dy, - T* tail_dx, T* tail_alpha_diff) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - T zero_val = static_cast(0); - T alpha_val = alpha[0]; - - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - const LoadType* dy_load = reinterpret_cast(dy + linear_index); - LoadPack dy_vec; - dy_vec.storage = *dy_load; - - LoadPack dx_vec; - T zero_val = static_cast(0.0); - if (alpha_requires_grad) { - LoadPack dalpha_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - dalpha_vec.elem[i] = zero_val; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; - } - } - *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; - } else { -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - } - } - *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - } - } - - if (tail && global_thread_id < n_tail) { - const T tail_dy_val = tail_dy[global_thread_id]; - if (tail_x[global_thread_id] > zero_val) { - tail_dx[global_thread_id] = tail_dy_val; - if (alpha_requires_grad) { tail_alpha_diff[global_thread_id] = zero_val; } - } else { - tail_dx[global_thread_id] = alpha_val * tail_dy_val; - if (alpha_requires_grad) { - tail_alpha_diff[global_thread_id] = tail_x[global_thread_id] * tail_dy_val; - } - } - } -} - 
-template -__global__ void BroadcastPReluMultiAlphaNaiveForwardGpu(const int32_t elem_cnt, - const int32_t alpha_size, - const int32_t inner_size, const T* x, - const T* alpha, T* y) { - const T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - int32_t alpha_idx = (i / inner_size) % alpha_size; - y[i] = x_i > zero_val ? x_i : x_i * alpha[alpha_idx]; - } -} - -template -__global__ void PReluForwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, - const IndexType inner_size, const T* x, const T* alpha, - T* y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - T zero_val = static_cast(0); - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - IndexType alpha_idx = (linear_index / inner_size) % alpha_size; - - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - LoadPack y_vec; - - T alpha_val = alpha[alpha_idx]; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - y_vec.elem[i] = x_vec.elem[i] > zero_val ? x_vec.elem[i] : x_vec.elem[i] * alpha_val; - } - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - } -} - -template -__global__ void BroadcastPReluMultiAlphaNaiveBackwardGpu(const int32_t elem_cnt, - const int32_t alpha_size, - const int32_t inner_size, const T* x, - const T* alpha, const T* dy, T* dx, - T* alpha_diff) { - const T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T dy_i = dy[i]; - int32_t alpha_i = (i / inner_size) % alpha_size; - if (x_i > zero_val) { - dx[i] = dy_i; - if (alpha_requires_grad) { alpha_diff[i] = zero_val; } - } else { - dx[i] = dy_i * alpha[alpha_i]; - if (alpha_requires_grad) { alpha_diff[i] = dy_i * x_i; } - } - } -} - -template -__global__ void PReluBackwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, - const IndexType inner_size, const T* x, const T* alpha, - const T* dy, T* dx, T* alpha_diff) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - T zero_val = static_cast(0); - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - IndexType alpha_idx = (linear_index / inner_size) % alpha_size; - - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - const LoadType* dy_load = reinterpret_cast(dy + linear_index); - LoadPack dy_vec; - dy_vec.storage = *dy_load; - - LoadPack dx_vec; - T alpha_val = alpha[alpha_idx]; - if (alpha_requires_grad) { - LoadPack dalpha_vec; - T zero_val = static_cast(0.0); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - dalpha_vec.elem[i] = zero_val; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; - } - } - *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; - } else { -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - } - } - 
*(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - } - } -} - -constexpr int32_t kBlockSize = 256; - -template -int GetLaunchPackSize(const int64_t inner_size) { - constexpr int type_pack_size = cuda::elementwise::PackSize(); - for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { - if (type_pack_size >= launch_pack_size && inner_size % launch_pack_size == 0) { - return launch_pack_size; - } - } - return 1; -} - -template -void DispatchPreluForwardPackSize(ep::Stream* stream, const int64_t elem_cnt, - const int64_t alpha_size, const int64_t inner_size, const T* x, - const T* alpha, T* y) { - int grid_size; - const int pack_size = GetLaunchPackSize(inner_size); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - PReluForwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } else if (pack_size == 4) { - PReluForwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } else if (pack_size == 2) { - PReluForwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } else { - BroadcastPReluMultiAlphaNaiveForwardGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } -} - -template -void DispatchPreluForwardIndex(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, - const int64_t inner_size, const T* x, const T* alpha, T* y) { - if (elem_cnt < GetMaxVal()) { - DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); - } else { - DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); - } -} - -template -void DispatchPreluBackwardPackSize(ep::Stream* stream, const int64_t elem_cnt, - const int64_t alpha_size, const int64_t inner_size, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - int grid_size; - const int pack_size = GetLaunchPackSize(inner_size); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - - if (pack_size == 8) { - if (alpha_requires_grad) { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - } else if (pack_size == 4) { - if (alpha_requires_grad) { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - } else if (pack_size == 2) { - if (alpha_requires_grad) { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - - } else { - if (alpha_requires_grad) { - BroadcastPReluMultiAlphaNaiveBackwardGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - BroadcastPReluMultiAlphaNaiveBackwardGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - } -} - -template -void 
DispatchPreluBackwardIndex(ep::Stream* stream, const int64_t elem_cnt, - const int64_t alpha_size, const int64_t inner_size, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - if (elem_cnt < GetMaxVal()) { - DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, - dy, dx, alpha_diff, alpha_requires_grad); - } else { - DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, - dy, dx, alpha_diff, alpha_requires_grad); - } -} - -template -void DispatchPreluBackwardSingleAlphaTail(ep::Stream* stream, const IndexType elem_cnt, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - constexpr int pack_size = cuda::elementwise::PackSize(); - const int64_t pack_num = elem_cnt / pack_size; - int grid_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - const int64_t tail_offset = pack_num * pack_size; - const int64_t n_tail = elem_cnt - tail_offset; - const bool tail = n_tail > 0 ? true : false; - if (tail) { - if (alpha_requires_grad) { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } else { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } - } else { - if (alpha_requires_grad) { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } else { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } - } -} - -template -void DispatchPreluBackwardSingleAlphaIndex(ep::Stream* stream, const int64_t elem_cnt, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - if (elem_cnt < GetMaxVal()) { - DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, - alpha_requires_grad); - } else { - DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, - alpha_requires_grad); - } -} - -} // namespace - -template -class GpuPReluKernel final : public user_op::OpKernel { - public: - GpuPReluKernel() = default; - ~GpuPReluKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - const int32_t batch = x->shape_view().At(0); - const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / batch / channels; - - if (alpha_size == 1) { - OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( - PreluForwardSingleAlphaPtrFunctor(reinterpret_cast(alpha->dptr())), elem_cnt, - reinterpret_cast(y->mut_dptr()), reinterpret_cast(x->dptr()), - ctx->stream()->As()->cuda_stream()))); - } else { - DispatchPreluForwardIndex( - ctx->stream(), elem_cnt, alpha_size, inner_size, reinterpret_cast(x->dptr()), - reinterpret_cast(alpha->dptr()), reinterpret_cast(y->mut_dptr())); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_PRELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("prelu").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_PRELU_KERNEL(half) -REGISTER_CUDA_PRELU_KERNEL(float) -REGISTER_CUDA_PRELU_KERNEL(double) - -template -class GpuPReluGradKernel final : public user_op::OpKernel { - public: - GpuPReluGradKernel() = default; - ~GpuPReluGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const bool alpha_requires_grad = ctx->Attr("alpha_requires_grad"); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); - T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(elem_cnt * sizeof(T))); - - const Shape& left_extended_shape = CreatePreluLeftExtendedShape(ShapeView(x->shape_view())); - - const int32_t batch = x->shape_view().At(0); - const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / batch / channels; - if (alpha_size == 1) { - DispatchPreluBackwardSingleAlphaIndex(ctx->stream(), elem_cnt, x->dptr(), - alpha->dptr(), dy->dptr(), dx->mut_dptr(), - broadcasted_alpha_diff, alpha_requires_grad); - } else { - DispatchPreluBackwardIndex(ctx->stream(), elem_cnt, alpha_size, inner_size, x->dptr(), - alpha->dptr(), dy->dptr(), dx->mut_dptr(), - broadcasted_alpha_diff, alpha_requires_grad); - } - if (alpha_requires_grad) { - NdarrayUtil::ReduceSum( - ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), - XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), - XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_PRELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("prelu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("x", 0); \ - const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ - const int64_t tmp_buffer_size = \ - 2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -REGISTER_CUDA_PRELU_GRAD_KERNEL(half) -REGISTER_CUDA_PRELU_GRAD_KERNEL(float) -REGISTER_CUDA_PRELU_GRAD_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +Shape CreatePreluLeftExtendedShape(const ShapeView& shape) { + DimVector dim_vec(shape.NumAxes()); + dim_vec.at(0) = 1LL; + dim_vec.at(1) = shape.At(1); + for (int i = 2; i < shape.NumAxes(); i++) { dim_vec.at(i) = 1LL; } + return Shape(std::move(dim_vec)); +} + +template +struct PreluForwardSingleAlphaFunctor { + OF_DEVICE_FUNC explicit PreluForwardSingleAlphaFunctor(const T alpha) : alpha(alpha) {} + __device__ T operator()(T x) const { return (x > static_cast(0.0)) ? 
x : (alpha * x); } + const T alpha; +}; + +template +struct PreluForwardSingleAlphaPtrFunctor { + OF_DEVICE_FUNC explicit PreluForwardSingleAlphaPtrFunctor(const T* alpha_ptr) + : alpha_ptr(alpha_ptr) {} + __device__ PreluForwardSingleAlphaFunctor operator()() const { + return PreluForwardSingleAlphaFunctor(*alpha_ptr); + } + const T* alpha_ptr; +}; + +template +__global__ void PReluBackwardSingleAlphaGpu(const IndexType elem_cnt, const int64_t n_tail, + const T* x, const T* alpha, const T* dy, T* dx, + T* alpha_diff, const T* tail_x, const T* tail_dy, + T* tail_dx, T* tail_alpha_diff) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + T zero_val = static_cast(0); + T alpha_val = alpha[0]; + + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + const LoadType* dy_load = reinterpret_cast(dy + linear_index); + LoadPack dy_vec; + dy_vec.storage = *dy_load; + + LoadPack dx_vec; + T zero_val = static_cast(0.0); + if (alpha_requires_grad) { + LoadPack dalpha_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + dalpha_vec.elem[i] = zero_val; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; + } else { +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + } + } + + if (tail && global_thread_id < n_tail) { + const T tail_dy_val = tail_dy[global_thread_id]; + if (tail_x[global_thread_id] > zero_val) { + tail_dx[global_thread_id] = tail_dy_val; + if (alpha_requires_grad) { tail_alpha_diff[global_thread_id] = zero_val; } + } else { + tail_dx[global_thread_id] = alpha_val * tail_dy_val; + if (alpha_requires_grad) { + tail_alpha_diff[global_thread_id] = tail_x[global_thread_id] * tail_dy_val; + } + } + } +} + +template +__global__ void BroadcastPReluMultiAlphaNaiveForwardGpu(const int32_t elem_cnt, + const int32_t alpha_size, + const int32_t inner_size, const T* x, + const T* alpha, T* y) { + const T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + int32_t alpha_idx = (i / inner_size) % alpha_size; + y[i] = x_i > zero_val ? 
x_i : x_i * alpha[alpha_idx]; + } +} + +template +__global__ void PReluForwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, + const IndexType inner_size, const T* x, const T* alpha, + T* y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + T zero_val = static_cast(0); + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + IndexType alpha_idx = (linear_index / inner_size) % alpha_size; + + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + LoadPack y_vec; + + T alpha_val = alpha[alpha_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + y_vec.elem[i] = x_vec.elem[i] > zero_val ? x_vec.elem[i] : x_vec.elem[i] * alpha_val; + } + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + } +} + +template +__global__ void BroadcastPReluMultiAlphaNaiveBackwardGpu(const int32_t elem_cnt, + const int32_t alpha_size, + const int32_t inner_size, const T* x, + const T* alpha, const T* dy, T* dx, + T* alpha_diff) { + const T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T dy_i = dy[i]; + int32_t alpha_i = (i / inner_size) % alpha_size; + if (x_i > zero_val) { + dx[i] = dy_i; + if (alpha_requires_grad) { alpha_diff[i] = zero_val; } + } else { + dx[i] = dy_i * alpha[alpha_i]; + if (alpha_requires_grad) { alpha_diff[i] = dy_i * x_i; } + } + } +} + +template +__global__ void PReluBackwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, + const IndexType inner_size, const T* x, const T* alpha, + const T* dy, T* dx, T* alpha_diff) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + T zero_val = static_cast(0); + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + IndexType alpha_idx = (linear_index / inner_size) % alpha_size; + + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + const LoadType* dy_load = reinterpret_cast(dy + linear_index); + LoadPack dy_vec; + dy_vec.storage = *dy_load; + + LoadPack dx_vec; + T alpha_val = alpha[alpha_idx]; + if (alpha_requires_grad) { + LoadPack dalpha_vec; + T zero_val = static_cast(0.0); +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + dalpha_vec.elem[i] = zero_val; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; + } else { +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + } + } +} + +constexpr int32_t kBlockSize = 256; + +template +int GetLaunchPackSize(const int64_t inner_size) { + constexpr int type_pack_size = cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && inner_size % 
launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +void DispatchPreluForwardPackSize(ep::Stream* stream, const int64_t elem_cnt, + const int64_t alpha_size, const int64_t inner_size, const T* x, + const T* alpha, T* y) { + int grid_size; + const int pack_size = GetLaunchPackSize(inner_size); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + PReluForwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } else if (pack_size == 4) { + PReluForwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } else if (pack_size == 2) { + PReluForwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } else { + BroadcastPReluMultiAlphaNaiveForwardGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } +} + +template +void DispatchPreluForwardIndex(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, + const int64_t inner_size, const T* x, const T* alpha, T* y) { + if (elem_cnt < GetMaxVal()) { + DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); + } else { + DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); + } +} + +template +void DispatchPreluBackwardPackSize(ep::Stream* stream, const int64_t elem_cnt, + const int64_t alpha_size, const int64_t inner_size, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + int grid_size; + const int pack_size = GetLaunchPackSize(inner_size); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + + if (pack_size == 8) { + if (alpha_requires_grad) { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + } else if (pack_size == 4) { + if (alpha_requires_grad) { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + } else if (pack_size == 2) { + if (alpha_requires_grad) { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + + } else { + if (alpha_requires_grad) { + BroadcastPReluMultiAlphaNaiveBackwardGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + BroadcastPReluMultiAlphaNaiveBackwardGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + } +} + +template +void DispatchPreluBackwardIndex(ep::Stream* stream, const int64_t elem_cnt, + const int64_t alpha_size, const int64_t inner_size, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + if (elem_cnt < GetMaxVal()) { + DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, + dy, dx, alpha_diff, alpha_requires_grad); + } 
else { + DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, + dy, dx, alpha_diff, alpha_requires_grad); + } +} + +template +void DispatchPreluBackwardSingleAlphaTail(ep::Stream* stream, const IndexType elem_cnt, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + constexpr int pack_size = cuda::elementwise::PackSize(); + const int64_t pack_num = elem_cnt / pack_size; + int grid_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + const int64_t tail_offset = pack_num * pack_size; + const int64_t n_tail = elem_cnt - tail_offset; + const bool tail = n_tail > 0 ? true : false; + if (tail) { + if (alpha_requires_grad) { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } else { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } + } else { + if (alpha_requires_grad) { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } else { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } + } +} + +template +void DispatchPreluBackwardSingleAlphaIndex(ep::Stream* stream, const int64_t elem_cnt, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + if (elem_cnt < GetMaxVal()) { + DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, + alpha_requires_grad); + } else { + DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, + alpha_requires_grad); + } +} + +} // namespace + +template +class GpuPReluKernel final : public user_op::OpKernel { + public: + GpuPReluKernel() = default; + ~GpuPReluKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + const int32_t batch = x->shape_view().At(0); + const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / batch / channels; + + if (alpha_size == 1) { + OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( + PreluForwardSingleAlphaPtrFunctor(reinterpret_cast(alpha->dptr())), elem_cnt, + reinterpret_cast(y->mut_dptr()), reinterpret_cast(x->dptr()), + ctx->stream()->As()->cuda_stream()))); + } else { + DispatchPreluForwardIndex( + ctx->stream(), elem_cnt, alpha_size, inner_size, reinterpret_cast(x->dptr()), + reinterpret_cast(alpha->dptr()), reinterpret_cast(y->mut_dptr())); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_PRELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("prelu").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_PRELU_KERNEL(half) +REGISTER_CUDA_PRELU_KERNEL(float) +REGISTER_CUDA_PRELU_KERNEL(double) + +template +class GpuPReluGradKernel final : public user_op::OpKernel { + public: + GpuPReluGradKernel() = default; + ~GpuPReluGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const bool alpha_requires_grad = ctx->Attr("alpha_requires_grad"); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); + T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(elem_cnt * sizeof(T))); + + const Shape& left_extended_shape = CreatePreluLeftExtendedShape(ShapeView(x->shape_view())); + + const int32_t batch = x->shape_view().At(0); + const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / batch / channels; + if (alpha_size == 1) { + DispatchPreluBackwardSingleAlphaIndex(ctx->stream(), elem_cnt, x->dptr(), + alpha->dptr(), dy->dptr(), dx->mut_dptr(), + broadcasted_alpha_diff, alpha_requires_grad); + } else { + DispatchPreluBackwardIndex(ctx->stream(), elem_cnt, alpha_size, inner_size, x->dptr(), + alpha->dptr(), dy->dptr(), dx->mut_dptr(), + broadcasted_alpha_diff, alpha_requires_grad); + } + if (alpha_requires_grad) { + NdarrayUtil::ReduceSum( + ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), + XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), + XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_PRELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("prelu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("x", 0); \ + const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ + const int64_t tmp_buffer_size = \ + 2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +REGISTER_CUDA_PRELU_GRAD_KERNEL(half) +REGISTER_CUDA_PRELU_GRAD_KERNEL(float) +REGISTER_CUDA_PRELU_GRAD_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/quantization_kernel.hip.cpp b/oneflow/user/kernels/quantization_kernel.hip.cpp index 51fbbde..2f31245 100644 --- a/oneflow/user/kernels/quantization_kernel.hip.cpp +++ b/oneflow/user/kernels/quantization_kernel.hip.cpp @@ -1,159 +1,159 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void QuantizationSymmetric(const T* in_ptr, const T* scale_ptr, const int64_t scale_size, - const int64_t elements, const int64_t panel_size, - const double quantization_bit, T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? 
lower_bound : out; - out_ptr[gid] = out; - - gid += step; - } -} - -template -__global__ void QuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, - const int64_t scale_size, const int64_t elements, - const int64_t panel_size, const double quantization_bit, - T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; - T lower_bound = 0; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - T zero_point = zero_point_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale + zero_point); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out; - - gid += step; - } -} - -template -__global__ void QuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, - const int64_t elements, const int64_t panel_size, - const double quantization_bit, T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - T scale = static_cast(pow(2.0, static_cast(shift[0]))); - - while (gid < elements) { - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out; - gid += step; - } -} - -} // namespace - -template -class GpuQuantizationKernel final : public user_op::OpKernel { - public: - GpuQuantizationKernel() = default; - ~GpuQuantizationKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - const int64_t elements = in->shape_view().elem_cnt(); - const int64_t panel_size = in->shape_view().Count(1); - const int64_t scale_size = scale->shape_view().elem_cnt(); - - // round to even - auto origin_round_mode = std::fegetround(); - std::fesetround(FE_TONEAREST); - - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - RUN_CUDA_KERNEL((QuantizationSymmetric), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { // quantization_scheme == "affine" - RUN_CUDA_KERNEL((QuantizationAffine), ctx->stream(), elements, in->dptr(), - scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, - quantization_bit, out->mut_dptr()); - } - } else if (quantization_formula == "cambricon") { - RUN_CUDA_KERNEL((QuantizationCambricon), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { - UNIMPLEMENTED(); - } - - std::fesetround(origin_round_mode); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define 
REGISTER_QUANTIZATION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("quantization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_QUANTIZATION_KERNEL(float); -REGISTER_QUANTIZATION_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void QuantizationSymmetric(const T* in_ptr, const T* scale_ptr, const int64_t scale_size, + const int64_t elements, const int64_t panel_size, + const double quantization_bit, T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = out; + + gid += step; + } +} + +template +__global__ void QuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, + const int64_t scale_size, const int64_t elements, + const int64_t panel_size, const double quantization_bit, + T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; + T lower_bound = 0; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + T zero_point = zero_point_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale + zero_point); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = out; + + gid += step; + } +} + +template +__global__ void QuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, + const int64_t elements, const int64_t panel_size, + const double quantization_bit, T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + T scale = static_cast(pow(2.0, static_cast(shift[0]))); + + while (gid < elements) { + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? 
lower_bound : out; + out_ptr[gid] = out; + gid += step; + } +} + +} // namespace + +template +class GpuQuantizationKernel final : public user_op::OpKernel { + public: + GpuQuantizationKernel() = default; + ~GpuQuantizationKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t panel_size = in->shape_view().Count(1); + const int64_t scale_size = scale->shape_view().elem_cnt(); + + // round to even + auto origin_round_mode = std::fegetround(); + std::fesetround(FE_TONEAREST); + + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + RUN_CUDA_KERNEL((QuantizationSymmetric), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { // quantization_scheme == "affine" + RUN_CUDA_KERNEL((QuantizationAffine), ctx->stream(), elements, in->dptr(), + scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, + quantization_bit, out->mut_dptr()); + } + } else if (quantization_formula == "cambricon") { + RUN_CUDA_KERNEL((QuantizationCambricon), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + + std::fesetround(origin_round_mode); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_QUANTIZATION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("quantization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_QUANTIZATION_KERNEL(float); +REGISTER_QUANTIZATION_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/radix_sort.hip.h b/oneflow/user/kernels/radix_sort.hip.h index 8cdd903..c47f090 100644 --- a/oneflow/user/kernels/radix_sort.hip.h +++ b/oneflow/user/kernels/radix_sort.hip.h @@ -1,280 +1,280 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
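// The quantization kernels in this file scale, round (to nearest even) and
// clamp each element: the symmetric scheme uses the range
// [-2^(bit-1), 2^(bit-1) - 1], the affine scheme uses [0, 2^bit - 1] with a
// zero point added before rounding. A minimal host-side sketch of that math
// (the helper name and parameters are illustrative only):
#include <algorithm>
#include <cfenv>
#include <cmath>

inline float QuantizeOneValue(float in, float scale, float zero_point, int bit, bool symmetric) {
  const float upper = symmetric ? std::pow(2.0f, bit - 1) - 1.0f : std::pow(2.0f, bit) - 1.0f;
  const float lower = symmetric ? -upper - 1.0f : 0.0f;
  std::fesetround(FE_TONEAREST);  // round half to even, as the kernel wrapper sets before launch
  const float out = std::nearbyint(in / scale + (symmetric ? 0.0f : zero_point));
  return std::min(std::max(out, lower), upper);
}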
-*/ -#ifndef ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ -#define ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ - -#include -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -class MultiplyFunctor final { - public: - MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} - __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { - return idx * num_col_; - } - - private: - int32_t num_col_; -}; - -} // namespace - -template -size_t InferTempStorageForSortPairsAscending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* d_values_in */ nullptr, - /* d_values_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -size_t InferTempStorageForSortPairsDescending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = - hipcub::DeviceSegmentedRadixSort::SortPairsDescending( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* d_values_in */ nullptr, - /* d_values_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -size_t InferTempStorageForSortKeysAscending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = hipcub::DeviceSegmentedRadixSort::SortKeys( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -size_t InferTempStorageForSortKeysDescending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeysDescending( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -void SortPairsAscending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, - int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, - KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortPairsAscending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* d_values_in */ values_ptr, - /* d_values_out */ sorted_values_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -template -void SortPairsDescending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, - int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, - KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortPairsDescending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = hipcub::DeviceSegmentedRadixSort::SortPairsDescending( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* d_values_in */ values_ptr, - /* d_values_out */ sorted_values_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -template -void SortKeysAscending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, - void* temp_storage_ptr, int32_t temp_storage_bytes, KeyType* sorted_keys_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortKeysAscending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeys( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -template -void SortKeysDescending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, - void* temp_storage_ptr, int32_t temp_storage_bytes, - KeyType* sorted_keys_ptr, hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortKeysDescending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
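// The radix_sort.hip.h helpers in this patch wrap hipcub::DeviceSegmentedRadixSort
// with the usual two-phase pattern: a first call with a null temp-storage pointer
// only reports temp_storage_bytes (this is what the InferTempStorage* functions do),
// and a second call with a real buffer performs the sort. A minimal sketch of that
// pattern, assuming a precomputed offsets array of num_segments + 1 ints (the
// function name and offset layout are illustrative only):
#include "hip/hip_runtime.h"
#include <hipcub/hipcub.hpp>

template<typename KeyType>
hipError_t SegmentedSortKeysTwoPhase(const KeyType* keys_in, KeyType* keys_out, int num_items,
                                     int num_segments, const int* d_offsets, hipStream_t stream) {
  // Phase 1: size query only; no sorting happens when d_temp_storage is nullptr.
  size_t temp_bytes = 0;
  hipError_t err = hipcub::DeviceSegmentedRadixSort::SortKeys(
      nullptr, temp_bytes, keys_in, keys_out, num_items, num_segments, d_offsets, d_offsets + 1,
      /*begin_bit=*/0, /*end_bit=*/sizeof(KeyType) * 8, stream);
  if (err != hipSuccess) { return err; }
  void* d_temp = nullptr;
  err = hipMalloc(&d_temp, temp_bytes);
  if (err != hipSuccess) { return err; }
  // Phase 2: the actual segmented sort, one segment per row.
  err = hipcub::DeviceSegmentedRadixSort::SortKeys(
      d_temp, temp_bytes, keys_in, keys_out, num_items, num_segments, d_offsets, d_offsets + 1,
      /*begin_bit=*/0, /*end_bit=*/sizeof(KeyType) * 8, stream);
  hipFree(d_temp);
  return err;
}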
+*/ +#ifndef ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ +#define ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ + +#include +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +class MultiplyFunctor final { + public: + MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} + __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { + return idx * num_col_; + } + + private: + int32_t num_col_; +}; + +} // namespace + +template +size_t InferTempStorageForSortPairsAscending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* d_values_in */ nullptr, + /* d_values_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +size_t InferTempStorageForSortPairsDescending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = + hipcub::DeviceSegmentedRadixSort::SortPairsDescending( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* d_values_in */ nullptr, + /* d_values_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +size_t InferTempStorageForSortKeysAscending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = hipcub::DeviceSegmentedRadixSort::SortKeys( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +size_t InferTempStorageForSortKeysDescending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeysDescending( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +void SortPairsAscending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, + int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, + KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortPairsAscending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* d_values_in */ values_ptr, + /* d_values_out */ sorted_values_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +template +void SortPairsDescending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, + int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, + KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortPairsDescending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = hipcub::DeviceSegmentedRadixSort::SortPairsDescending( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* d_values_in */ values_ptr, + /* d_values_out */ sorted_values_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +template +void SortKeysAscending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, + void* temp_storage_ptr, int32_t temp_storage_bytes, KeyType* sorted_keys_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortKeysAscending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeys( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +template +void SortKeysDescending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, + void* temp_storage_ptr, int32_t temp_storage_bytes, + KeyType* sorted_keys_ptr, hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortKeysDescending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +} // namespace oneflow + #endif // ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ \ No newline at end of file diff --git a/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp b/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp index f1ea0e3..073f8e8 100644 --- a/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp +++ b/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp @@ -1,145 +1,145 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
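A note on the Infer*/Sort* helpers in radix_sort.hip.h above: they follow hipCUB's two-phase convention. The first call passes a null d_temp_storage pointer, so hipCUB only writes the required workspace size into temp_storage_bytes; the second call, with a real pointer, performs the sort. The kernels below pre-compute that size in SetInferTmpSizeFn and carve the workspace out of their tmp_buffer instead of allocating at run time. The following is a minimal, self-contained sketch of the same pattern with an ad-hoc hipMalloc; SortRowsDescendingDemo and RowOffsetFunctor are illustrative names, not part of this patch, and it assumes a ROCm toolchain with hipCUB available.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// Maps segment index i to its begin offset i * num_col, so per-row segment boundaries
// never have to be materialized in device memory.
struct RowOffsetFunctor {
  explicit RowOffsetFunctor(int num_col) : num_col_(num_col) {}
  __host__ __device__ int operator()(int i) const { return i * num_col_; }
  int num_col_;
};

template<typename KeyType>
hipError_t SortRowsDescendingDemo(const KeyType* keys_in, KeyType* keys_out, int num_row,
                                  int num_col, hipStream_t stream) {
  hipcub::CountingInputIterator<int> counting(0);
  RowOffsetFunctor to_offset(num_col);
  hipcub::TransformInputIterator<int, RowOffsetFunctor, hipcub::CountingInputIterator<int>>
      offsets(counting, to_offset);

  size_t temp_bytes = 0;
  // Phase 1: null workspace pointer -> hipCUB only computes the required byte count.
  hipError_t err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending(
      nullptr, temp_bytes, keys_in, keys_out, num_row * num_col, num_row, offsets, offsets + 1,
      0, sizeof(KeyType) * 8, stream);
  if (err != hipSuccess) { return err; }

  void* workspace = nullptr;
  err = hipMalloc(&workspace, temp_bytes);
  if (err != hipSuccess) { return err; }

  // Phase 2: the identical call with a real workspace performs the per-row descending sort.
  err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending(
      workspace, temp_bytes, keys_in, keys_out, num_row * num_col, num_row, offsets, offsets + 1,
      0, sizeof(KeyType) * 8, stream);
  hipFree(workspace);
  return err;
}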
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(int64_t capacity, void* ptr, const ShapeView& in_shape) - : capacity_{capacity}, - sorted_in_elem_cnt_{in_shape.elem_cnt()}, - indices_elem_cnt_{sorted_in_elem_cnt_}, - sorted_indices_elem_cnt_{sorted_in_elem_cnt_} { - const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); - const int64_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); - const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; - sorted_in_ptr_ = reinterpret_cast(ptr); - indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) - + sorted_in_aligned_bytes); - sorted_indices_ptr_ = - reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); - temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(sorted_indices_ptr_) - + sorted_indices_aligned_bytes); - temp_storage_bytes_ = - capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes - sorted_indices_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - T* SortedInPtr() const { return sorted_in_ptr_; } - int64_t* IndicesPtr() const { return indices_ptr_; } - int64_t* SortedIndicesPtr() const { return sorted_indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int64_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int64_t capacity_; - - T* sorted_in_ptr_; - int64_t* indices_ptr_; - int64_t* sorted_indices_ptr_; - void* temp_storage_ptr_; - - int64_t sorted_in_elem_cnt_; - int64_t indices_elem_cnt_; - int64_t sorted_indices_elem_cnt_; - int64_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; -} - -} // namespace - -template -class GpuRadixSortTopKKernel final : public user_op::OpKernel { - public: - GpuRadixSortTopKKernel() = default; - ~GpuRadixSortTopKKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape_view().elem_cnt() == 0) { return; } - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), - tmp_buffer->mut_dptr(), in->shape_view()); - - const int64_t elem_cnt = in->shape_view().elem_cnt(); - const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int64_t instance_num = elem_cnt / instance_size; - const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); - InitializeIndices<<stream()->As()->cuda_stream()>>>( - elem_cnt, buf_manager.IndicesPtr(), instance_size); - SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), buf_manager.SortedIndicesPtr(), - ctx->stream()->As()->cuda_stream()); - OF_CUDA_CHECK(hipMemcpy2DAsync(out->mut_dptr(), k * sizeof(int64_t), - buf_manager.SortedIndicesPtr(), 
instance_size * sizeof(int64_t), - k * sizeof(int64_t), instance_num, hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(dtype) \ - REGISTER_USER_KERNEL("top_k") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobAttr("k") > 128) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int64_t elem_cnt = in_shape.elem_cnt(); \ - const int64_t instance_size = in_shape.dim_vec().back(); \ - const int64_t instance_num = elem_cnt / instance_size; \ - \ - /* Sorted In*/ \ - const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ - /* Indices */ \ - const int64_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int64_t)); \ - /* Sorted Indices */ \ - const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; \ - /* CUB Temp Storage */ \ - int64_t temp_storage_bytes = \ - InferTempStorageForSortPairsDescending(instance_num, instance_size); \ - \ - return sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes \ - + temp_storage_bytes; \ - }); - -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(float) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(double) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(uint8_t) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int8_t) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int32_t) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
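The top_k kernel above (registered only for k > 128) does not select at all: it fully sorts every row in one segmented descending sort of (value, index) pairs and then gathers the leading k indices of each row with a single strided hipMemcpy2DAsync. In that call the source pitch is instance_size * sizeof(int64_t) (one full sorted row), the destination pitch and the copied width are both k * sizeof(int64_t), and the height is instance_num, so the first k columns of the sorted-indices matrix land contiguously in out. A host-side loop equivalent to that geometry, shown only to make the pitch arithmetic concrete (with instance_size = 1000 and k = 200 the pitches would be 8000 and 1600 bytes):

// Keep the first k of each row of instance_num rows of instance_size sorted indices.
for (int64_t r = 0; r < instance_num; ++r) {
  for (int64_t c = 0; c < k; ++c) { out[r * k + c] = sorted_indices[r * instance_size + c]; }
}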
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(int64_t capacity, void* ptr, const ShapeView& in_shape) + : capacity_{capacity}, + sorted_in_elem_cnt_{in_shape.elem_cnt()}, + indices_elem_cnt_{sorted_in_elem_cnt_}, + sorted_indices_elem_cnt_{sorted_in_elem_cnt_} { + const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); + const int64_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); + const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; + sorted_in_ptr_ = reinterpret_cast(ptr); + indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) + + sorted_in_aligned_bytes); + sorted_indices_ptr_ = + reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); + temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(sorted_indices_ptr_) + + sorted_indices_aligned_bytes); + temp_storage_bytes_ = + capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes - sorted_indices_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + T* SortedInPtr() const { return sorted_in_ptr_; } + int64_t* IndicesPtr() const { return indices_ptr_; } + int64_t* SortedIndicesPtr() const { return sorted_indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int64_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int64_t capacity_; + + T* sorted_in_ptr_; + int64_t* indices_ptr_; + int64_t* sorted_indices_ptr_; + void* temp_storage_ptr_; + + int64_t sorted_in_elem_cnt_; + int64_t indices_elem_cnt_; + int64_t sorted_indices_elem_cnt_; + int64_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; +} + +} // namespace + +template +class GpuRadixSortTopKKernel final : public user_op::OpKernel { + public: + GpuRadixSortTopKKernel() = default; + ~GpuRadixSortTopKKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + if (in->shape_view().elem_cnt() == 0) { return; } + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr(), in->shape_view()); + + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = elem_cnt / instance_size; + const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); + InitializeIndices<<stream()->As()->cuda_stream()>>>( + elem_cnt, buf_manager.IndicesPtr(), instance_size); + SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), buf_manager.SortedIndicesPtr(), + ctx->stream()->As()->cuda_stream()); + OF_CUDA_CHECK(hipMemcpy2DAsync(out->mut_dptr(), k * sizeof(int64_t), + buf_manager.SortedIndicesPtr(), 
instance_size * sizeof(int64_t), + k * sizeof(int64_t), instance_num, hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(dtype) \ + REGISTER_USER_KERNEL("top_k") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobAttr("k") > 128) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int64_t elem_cnt = in_shape.elem_cnt(); \ + const int64_t instance_size = in_shape.dim_vec().back(); \ + const int64_t instance_num = elem_cnt / instance_size; \ + \ + /* Sorted In*/ \ + const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ + /* Indices */ \ + const int64_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int64_t)); \ + /* Sorted Indices */ \ + const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; \ + /* CUB Temp Storage */ \ + int64_t temp_storage_bytes = \ + InferTempStorageForSortPairsDescending(instance_num, instance_size); \ + \ + return sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes \ + + temp_storage_bytes; \ + }); + +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(float) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(double) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(uint8_t) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int8_t) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int32_t) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/random_mask_generator.hip.cpp b/oneflow/user/kernels/random_mask_generator.hip.cpp index c28b7e3..6408b5b 100644 --- a/oneflow/user/kernels/random_mask_generator.hip.cpp +++ b/oneflow/user/kernels/random_mask_generator.hip.cpp @@ -1,69 +1,69 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
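TmpBufferManager above and the SetInferTmpSizeFn in the registration are two views of the same plan for the single tmp_buffer: four regions laid out back to back, each rounded up by GetCudaAlignedSize, with whatever remains handed to hipCUB as its workspace (the CHECK_GE guards against that remainder going negative). As a worked example, assuming a float input of shape (64, 1024), elem_cnt = 65536 and the layout is

  | sorted_in: 65536 * 4 B = 262144 | indices: 65536 * 8 B = 524288 | sorted_indices: 524288 | hipCUB temp storage: InferTempStorageForSortPairsDescending(64, 1024) |

The inferred total is exactly what SetInferTmpSizeFn returns, which is why the runtime CHECK_LE inside SortPairsDescending holds.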
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/random_mask_generator.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int32_t kMinPackPerThread = 2; - -using PackType = ulonglong2; - -union Pack { - PackType p_value; - bool b_value[sizeof(PackType)]; -}; - -__device__ bool GenMask(hiprandState* state, const float rate) { - return hiprand_uniform(state) > rate; -} - -__global__ void GenerateGpu(hiprandState* state, const int64_t n, const float rate, bool* mask) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - PackType* pack_mask = reinterpret_cast(mask); - Pack pack; - CUDA_1D_KERNEL_LOOP(i, n / sizeof(PackType)) { -#pragma unroll - for (int j = 0; j < sizeof(PackType); ++j) { pack.b_value[j] = GenMask(&localState, rate); } - pack_mask[i] = pack.p_value; - } - const int32_t rem_cnt = n % sizeof(PackType); - const int32_t rem_offset = n - rem_cnt; - if (id < rem_cnt) { mask[id + rem_offset] = GenMask(&localState, rate); } - state[id] = localState; -} - -} // namespace - -void RandomMaskGenerator::Generate(ep::Stream* stream, const int64_t n, - const float rate, bool* mask) { - int32_t block_num = generator_->max_block_num(); - int32_t thread_num = generator_->max_thread_num(); - auto* curand_states = generator_->curand_states(); - const int32_t elem_cnt_per_block = thread_num * sizeof(PackType) * kMinPackPerThread; - const int32_t block_num_final = - std::min(static_cast((n + elem_cnt_per_block - 1) / elem_cnt_per_block), block_num); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, n, rate, mask); -} - -template class RandomMaskGenerator; - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/random_mask_generator.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int32_t kMinPackPerThread = 2; + +using PackType = ulonglong2; + +union Pack { + PackType p_value; + bool b_value[sizeof(PackType)]; +}; + +__device__ bool GenMask(hiprandState* state, const float rate) { + return hiprand_uniform(state) > rate; +} + +__global__ void GenerateGpu(hiprandState* state, const int64_t n, const float rate, bool* mask) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + PackType* pack_mask = reinterpret_cast(mask); + Pack pack; + CUDA_1D_KERNEL_LOOP(i, n / sizeof(PackType)) { +#pragma unroll + for (int j = 0; j < sizeof(PackType); ++j) { pack.b_value[j] = GenMask(&localState, rate); } + pack_mask[i] = pack.p_value; + } + const int32_t rem_cnt = n % sizeof(PackType); + const int32_t rem_offset = n - rem_cnt; + if (id < rem_cnt) { mask[id + rem_offset] = GenMask(&localState, rate); } + state[id] = localState; +} + +} // namespace + +void RandomMaskGenerator::Generate(ep::Stream* stream, const int64_t n, + const float rate, bool* mask) { + int32_t block_num = generator_->max_block_num(); + int32_t thread_num = generator_->max_thread_num(); + auto* curand_states = generator_->curand_states(); + const int32_t elem_cnt_per_block = thread_num * sizeof(PackType) * kMinPackPerThread; + const int32_t block_num_final = + std::min(static_cast((n + elem_cnt_per_block - 1) / elem_cnt_per_block), block_num); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, n, rate, mask); +} + +template class RandomMaskGenerator; + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/randperm_kernel.hip.cpp b/oneflow/user/kernels/randperm_kernel.hip.cpp index 94dcf6c..611ba57 100644 --- a/oneflow/user/kernels/randperm_kernel.hip.cpp +++ b/oneflow/user/kernels/randperm_kernel.hip.cpp @@ -1,201 +1,201 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include -#include -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/ep/include/stream.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/random_generator.h" -#include "oneflow/user/kernels/op_kernel_wrapper.h" -#include "oneflow/user/kernels/arange_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/user/kernels/distributions/common.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/job/nd_sbp_util.h" -#include "oneflow/core/common/container_util.h" -#include "oneflow/core/register/tensor_slice_view.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { -__global__ void GeneKeysAndValues(const int32_t n, int32_t* values, int32_t* keys, - hiprandState* state) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState local_state = state[id]; - CUDA_1D_KERNEL_LOOP(i, n) { - keys[i] = hiprand(&local_state); - values[i] = i; - } - state[id] = local_state; -} - -__global__ void tempcopy2output(const int32_t n, const int32_t offset, int32_t* temp, - int32_t* output) { - CUDA_1D_KERNEL_LOOP(i, n) { output[i] = temp[offset + i]; } -} -class GpuRandPermKernelCache final : public user_op::OpKernelCache { - public: - GpuRandPermKernelCache(int32_t lower, int32_t upper) : lower_(lower), upper_(upper) {} - ~GpuRandPermKernelCache() override = default; - - int32_t lower() const { return lower_; } - int32_t upper() const { return upper_; } - - private: - const int32_t lower_; - const int32_t upper_; -}; - -namespace { - -template -size_t GetCubSortPairsTempStorageSize(int64_t n) { - size_t cub_sort_temp_store_size = 0; - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, - nullptr, nullptr, nullptr, n))); - size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); - CHECK_GE(temp_store_size, 0) << "temp_store_size should >= 0."; - CHECK_LT(temp_store_size, static_cast(GetMaxVal())) - << "temp_store_size should < " << static_cast(GetMaxVal()); - return temp_store_size; -} - -} // namespace - -class GpuRandPermKernel final : public user_op::OpKernel { - public: - GpuRandPermKernel() = default; - ~GpuRandPermKernel() = default; - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - if (parallel_num > 1) { - const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); - const Shape& hierarchy = *ctx->parallel_desc().hierarchy(); - int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - int32_t n = ctx->Attr("n"); - const Shape& logical_shape = Shape({n}); - TensorSliceView view = - GetTensorSliceView4ParallelId(hierarchy, nd_sbp, logical_shape, parallel_id); - std::shared_ptr cache( - new GpuRandPermKernelCache(view.At(0).begin(), view.At(0).end())); - return cache; - } else { - return nullptr; - } - } - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeGenerator(kCUDA)); - generator->set_current_seed(ctx->Attr("seed")); - return std::make_shared(generator); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int32_t* output = out->mut_dptr(); - const int32_t 
n = ctx->Attr("n"); - if (n == 0) { return; } - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - auto* distribution_state = dynamic_cast(state); - CHECK_NOTNULL(distribution_state); - const auto& generator = distribution_state->generator(); - auto* stream = ctx->stream(); - const auto device_index = stream->device()->device_index(); - const auto& gpu_generator = CHECK_JUST(generator->Get(device_index)); - CHECK_NOTNULL(generator); - - int32_t block_num = gpu_generator->max_block_num(); - int32_t thread_num = gpu_generator->max_thread_num(); - hiprandState* curand_states = gpu_generator->curand_states(); - - // layout for tmp |...key(in and out,2xN)..|....value....|.... space for sort function....| - // values are the desired indexes ,and keys are generated randomly. - void* tmp = tmp_buffer->mut_dptr(); - int32_t* key_base = reinterpret_cast(tmp); - - const int32_t key_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - int32_t* value_base = - reinterpret_cast(reinterpret_cast(key_base) + 2 * key_aligned_bytes); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - int32_t* temp_buffer_base = - reinterpret_cast(reinterpret_cast(value_base) + indices_aligned_bytes); - const int32_t temp_buffer_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - - void* tmp_base = reinterpret_cast(reinterpret_cast(temp_buffer_base) - + temp_buffer_aligned_bytes); - size_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); - GeneKeysAndValues<<As()->cuda_stream()>>>( - n, value_base, key_base, curand_states); - if (cache == nullptr) { - auto err = hipcub::DeviceRadixSort::SortPairs( - /* d_temp_storage */ tmp_base, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ key_base, - /* d_keys_out */ key_base + n, - /* d_values_in */ value_base, - /* d_values_out */ output, - /* num_items */ n, - /* begin_bit */ 0, - /* end_bit */ sizeof(int32_t) * 8, - /* stream */ ctx->stream()->As()->cuda_stream()); - OF_CUDA_CHECK(err); - } else { - auto err = hipcub::DeviceRadixSort::SortPairs( - /* d_temp_storage */ tmp_base, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ key_base, - /* d_keys_out */ key_base + n, - /* d_values_in */ value_base, - /* d_values_out */ temp_buffer_base, - /* num_items */ n, - /* begin_bit */ 0, - /* end_bit */ sizeof(int32_t) * 8, - /* stream */ ctx->stream()->As()->cuda_stream()); - OF_CUDA_CHECK(err); - const auto* randperm_cache = dynamic_cast(cache); - auto len = randperm_cache->upper() - randperm_cache->lower(); - const int64_t offset = randperm_cache->lower(); - tempcopy2output<<stream()->As()->cuda_stream()>>>( - len, offset, temp_buffer_base, output); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; -REGISTER_USER_KERNEL("randperm") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - const int32_t n = ctx->Attr("n"); - /* Sorted In */ - const int32_t sorted_in_aligned_bytes = 2 * GetCudaAlignedSize(n * sizeof(int32_t)); - /* Indices */ - const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - const int32_t temp_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - - /* CUB Temp Storage */ - const int32_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); - - return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes - + temp_aligned_bytes; - }); +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
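GpuRandPermKernel above produces a permutation by sorting rather than by an in-place Fisher-Yates shuffle: GeneKeysAndValues fills keys with hiprand() draws and values with 0..n-1, and hipcub::DeviceRadixSort::SortPairs keyed on those random numbers leaves a (nearly) uniform permutation in its values output; duplicate keys only affect the relative order of the colliding elements. The tmp layout in the comment, | keys in and out (2N) | values (N) | temp output (N) | CUB workspace |, lets the sort run out-of-place, and in the multi-rank branch the full permutation is first written to the temp output so tempcopy2output can copy only the [lower, upper) slice this rank owns, which stays globally consistent as long as every rank uses the same seed. A host-side analogue of the shuffle-by-sort step, offered as a minimal sketch (RandPermBySort is an illustrative name, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

std::vector<int32_t> RandPermBySort(int32_t n, uint64_t seed) {
  std::mt19937_64 rng(seed);
  std::vector<uint32_t> keys(n);
  for (auto& key : keys) { key = static_cast<uint32_t>(rng()); }  // random sort keys
  std::vector<int32_t> perm(n);
  std::iota(perm.begin(), perm.end(), 0);  // the "values": 0, 1, ..., n - 1
  // Sorting the values by their random keys mirrors DeviceRadixSort::SortPairs(keys, values).
  std::stable_sort(perm.begin(), perm.end(),
                   [&](int32_t a, int32_t b) { return keys[a] < keys[b]; });
  return perm;
}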
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#include +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/ep/include/stream.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/random_generator.h" +#include "oneflow/user/kernels/op_kernel_wrapper.h" +#include "oneflow/user/kernels/arange_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/user/kernels/distributions/common.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/job/nd_sbp_util.h" +#include "oneflow/core/common/container_util.h" +#include "oneflow/core/register/tensor_slice_view.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { +__global__ void GeneKeysAndValues(const int32_t n, int32_t* values, int32_t* keys, + hiprandState* state) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState local_state = state[id]; + CUDA_1D_KERNEL_LOOP(i, n) { + keys[i] = hiprand(&local_state); + values[i] = i; + } + state[id] = local_state; +} + +__global__ void tempcopy2output(const int32_t n, const int32_t offset, int32_t* temp, + int32_t* output) { + CUDA_1D_KERNEL_LOOP(i, n) { output[i] = temp[offset + i]; } +} +class GpuRandPermKernelCache final : public user_op::OpKernelCache { + public: + GpuRandPermKernelCache(int32_t lower, int32_t upper) : lower_(lower), upper_(upper) {} + ~GpuRandPermKernelCache() override = default; + + int32_t lower() const { return lower_; } + int32_t upper() const { return upper_; } + + private: + const int32_t lower_; + const int32_t upper_; +}; + +namespace { + +template +size_t GetCubSortPairsTempStorageSize(int64_t n) { + size_t cub_sort_temp_store_size = 0; + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, + nullptr, nullptr, nullptr, n))); + size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); + CHECK_GE(temp_store_size, 0) << "temp_store_size should >= 0."; + CHECK_LT(temp_store_size, static_cast(GetMaxVal())) + << "temp_store_size should < " << static_cast(GetMaxVal()); + return temp_store_size; +} + +} // namespace + +class GpuRandPermKernel final : public user_op::OpKernel { + public: + GpuRandPermKernel() = default; + ~GpuRandPermKernel() = default; + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + if (parallel_num > 1) { + const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); + const Shape& hierarchy = *ctx->parallel_desc().hierarchy(); + int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + int32_t n = ctx->Attr("n"); + const Shape& logical_shape = Shape({n}); + TensorSliceView view = + GetTensorSliceView4ParallelId(hierarchy, nd_sbp, logical_shape, parallel_id); + std::shared_ptr cache( + new GpuRandPermKernelCache(view.At(0).begin(), view.At(0).end())); + return cache; + } else { + return nullptr; + } + } + std::shared_ptr 
CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + const auto& generator = CHECK_JUST(one::MakeGenerator(kCUDA)); + generator->set_current_seed(ctx->Attr("seed")); + return std::make_shared(generator); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + int32_t* output = out->mut_dptr(); + const int32_t n = ctx->Attr("n"); + if (n == 0) { return; } + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + auto* distribution_state = dynamic_cast(state); + CHECK_NOTNULL(distribution_state); + const auto& generator = distribution_state->generator(); + auto* stream = ctx->stream(); + const auto device_index = stream->device()->device_index(); + const auto& gpu_generator = CHECK_JUST(generator->Get(device_index)); + CHECK_NOTNULL(generator); + + int32_t block_num = gpu_generator->max_block_num(); + int32_t thread_num = gpu_generator->max_thread_num(); + hiprandState* curand_states = gpu_generator->curand_states(); + + // layout for tmp |...key(in and out,2xN)..|....value....|.... space for sort function....| + // values are the desired indexes ,and keys are generated randomly. + void* tmp = tmp_buffer->mut_dptr(); + int32_t* key_base = reinterpret_cast(tmp); + + const int32_t key_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + int32_t* value_base = + reinterpret_cast(reinterpret_cast(key_base) + 2 * key_aligned_bytes); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + int32_t* temp_buffer_base = + reinterpret_cast(reinterpret_cast(value_base) + indices_aligned_bytes); + const int32_t temp_buffer_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + + void* tmp_base = reinterpret_cast(reinterpret_cast(temp_buffer_base) + + temp_buffer_aligned_bytes); + size_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); + GeneKeysAndValues<<As()->cuda_stream()>>>( + n, value_base, key_base, curand_states); + if (cache == nullptr) { + auto err = hipcub::DeviceRadixSort::SortPairs( + /* d_temp_storage */ tmp_base, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ key_base, + /* d_keys_out */ key_base + n, + /* d_values_in */ value_base, + /* d_values_out */ output, + /* num_items */ n, + /* begin_bit */ 0, + /* end_bit */ sizeof(int32_t) * 8, + /* stream */ ctx->stream()->As()->cuda_stream()); + OF_CUDA_CHECK(err); + } else { + auto err = hipcub::DeviceRadixSort::SortPairs( + /* d_temp_storage */ tmp_base, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ key_base, + /* d_keys_out */ key_base + n, + /* d_values_in */ value_base, + /* d_values_out */ temp_buffer_base, + /* num_items */ n, + /* begin_bit */ 0, + /* end_bit */ sizeof(int32_t) * 8, + /* stream */ ctx->stream()->As()->cuda_stream()); + OF_CUDA_CHECK(err); + const auto* randperm_cache = dynamic_cast(cache); + auto len = randperm_cache->upper() - randperm_cache->lower(); + const int64_t offset = randperm_cache->lower(); + tempcopy2output<<stream()->As()->cuda_stream()>>>( + len, offset, temp_buffer_base, output); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; +REGISTER_USER_KERNEL("randperm") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + const int32_t n = ctx->Attr("n"); + /* Sorted In 
*/ + const int32_t sorted_in_aligned_bytes = 2 * GetCudaAlignedSize(n * sizeof(int32_t)); + /* Indices */ + const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + const int32_t temp_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + + /* CUB Temp Storage */ + const int32_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); + + return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes + + temp_aligned_bytes; + }); } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp b/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp index 3b3d3dc..f4c1d22 100644 --- a/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp +++ b/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp @@ -1,73 +1,73 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/roll_kernel_utils.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#include - -namespace oneflow { - -namespace { - -template -__global__ void repeat_interleave(const T* in_ptr, const T* cumsum_ptr, T* out_ptr, - const int64_t num) { - CUDA_1D_KERNEL_LOOP(i, num) { - T end = cumsum_ptr[i]; - T size = in_ptr[i]; - T start = end - size; - for (T j = start; j < end; j++) { out_ptr[j] = i; } - } -} - -} // namespace - -template -class GpuRepeatInterLeaveKernel final : public user_op::OpKernel { - public: - GpuRepeatInterLeaveKernel() = default; - ~GpuRepeatInterLeaveKernel() = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* cumsum = ctx->Tensor4ArgNameAndIndex("cumsum", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t& repeat_num = ctx->Attr("repeat_num"); - const T* in_ptr = in->dptr(); - const T* cumsum_ptr = cumsum->dptr(); - T* out_ptr = out->mut_dptr(); - - repeat_interleave<<shape_view().At(0)), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - in_ptr, cumsum_ptr, out_ptr, in->shape_view().At(0)); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_REPEAT_INTER_LEAVE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("repeat_interleave") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_REPEAT_INTER_LEAVE_KERNEL(int32_t); -REGISTER_REPEAT_INTER_LEAVE_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/roll_kernel_utils.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#include + +namespace oneflow { + +namespace { + +template +__global__ void repeat_interleave(const T* in_ptr, const T* cumsum_ptr, T* out_ptr, + const int64_t num) { + CUDA_1D_KERNEL_LOOP(i, num) { + T end = cumsum_ptr[i]; + T size = in_ptr[i]; + T start = end - size; + for (T j = start; j < end; j++) { out_ptr[j] = i; } + } +} + +} // namespace + +template +class GpuRepeatInterLeaveKernel final : public user_op::OpKernel { + public: + GpuRepeatInterLeaveKernel() = default; + ~GpuRepeatInterLeaveKernel() = default; + + private: + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* cumsum = ctx->Tensor4ArgNameAndIndex("cumsum", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t& repeat_num = ctx->Attr("repeat_num"); + const T* in_ptr = in->dptr(); + const T* cumsum_ptr = cumsum->dptr(); + T* out_ptr = out->mut_dptr(); + + repeat_interleave<<shape_view().At(0)), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + in_ptr, cumsum_ptr, out_ptr, in->shape_view().At(0)); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_REPEAT_INTER_LEAVE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("repeat_interleave") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_REPEAT_INTER_LEAVE_KERNEL(int32_t); +REGISTER_REPEAT_INTER_LEAVE_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/roi_align_kernel.hip.cpp b/oneflow/user/kernels/roi_align_kernel.hip.cpp index f9492c6..a6f3608 100644 --- a/oneflow/user/kernels/roi_align_kernel.hip.cpp +++ b/oneflow/user/kernels/roi_align_kernel.hip.cpp @@ -1,302 +1,302 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
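The repeat_interleave kernel above takes the repeat counts and their inclusive cumulative sum and lets thread i fill its own half-open output range [cumsum[i] - in[i], cumsum[i]) with the value i, so no atomics or extra synchronization are needed. A host analogue with a concrete case, for illustration only: in = {2, 0, 3} and cumsum = {2, 2, 5} produce out = {0, 0, 2, 2, 2}.

// num is the length of in/cumsum; out holds cumsum[num - 1] elements.
for (int64_t i = 0; i < num; ++i) {
  for (int64_t j = cumsum[i] - in[i]; j < cumsum[i]; ++j) { out[j] = i; }
}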
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ T BilinearInterpolate(const T* channel_dptr, const int32_t height, const int32_t width, - T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { return 0; } - - if (y <= 0) { y = 0; } - if (x <= 0) { x = 0; } - int32_t y_low = static_cast(y); - int32_t x_low = static_cast(x); - int32_t y_high = 0; - int32_t x_high = 0; - - if (y_low >= height - 1) { - y_low = height - 1; - y_high = y_low; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_low = width - 1; - x_high = x_low; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - const T ly = y - y_low; - const T lx = x - x_low; - const T hy = 1.f - ly; - const T hx = 1.f - lx; - - // https://en.wikipedia.org/wiki/Bilinear_interpolation - const int64_t q11 = y_low * width + x_low; - const int64_t q21 = y_low * width + x_high; - const int64_t q12 = y_high * width + x_low; - const int64_t q22 = y_high * width + x_high; - // no 1 / (x_high - x_low) * (y_high - y_low) because it will always be 1 in RoI Align - return (hy * hx) * channel_dptr[q11] + (hy * lx) * channel_dptr[q21] - + (ly * hx) * channel_dptr[q12] + (ly * lx) * channel_dptr[q22]; -} - -template -__device__ bool BilinearInterpolateDiff(const T bin_diff_avg, const int64_t height, - const int64_t width, T y, T x, T& diff11, T& diff21, - T& diff12, T& diff22, int32_t& x_low, int32_t& x_high, - int32_t& y_low, int32_t& y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { return false; } - - if (y <= 0) { y = 0; } - if (x <= 0) { x = 0; } - - y_low = static_cast(y); - x_low = static_cast(x); - - if (y_low >= height - 1) { - y_low = height - 1; - y_high = y_low; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_low = width - 1; - x_high = x_low; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - const T ly = y - y_low; - const T lx = x - x_low; - const T hy = 1.f - ly; - const T hx = 1.f - lx; - - diff11 = bin_diff_avg * hy * hx; - diff21 = bin_diff_avg * hy * lx; - diff12 = bin_diff_avg * ly * hx; - diff22 = bin_diff_avg * ly * lx; - return true; -} - -template -__global__ void RoiAlignForward(const int64_t nthreads, const T* in_dptr, const T* rois_dptr, - const T spatial_scale, const int32_t sampling_ratio, - const int64_t channel_num, const int64_t height, - const int64_t width, const int64_t pooled_height, - const int64_t pooled_width, const bool aligned, T* out_dptr) { - const int64_t pooled_area = pooled_height * pooled_width; - const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int64_t h = (index / pooled_width) % pooled_height; - const int64_t w = index % pooled_width; - const int64_t c = (index / pooled_area) % channel_num; - const int64_t r = index / channel_pooled_area; - const T* offset_rois_dptr = rois_dptr + r * 5; - const int64_t n = static_cast(offset_rois_dptr[0]); - const T align_offset = aligned ? 
static_cast(0.5) : static_cast(0.f); - const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; - const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; - const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; - const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; - T roi_height = roi_end_h - roi_start_h; - T roi_width = roi_end_w - roi_start_w; - // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of - // determining minimum roi size - if (aligned == false) { - roi_height = max(roi_height, static_cast(1.0)); - roi_width = max(roi_width, static_cast(1.0)); - } - const T bin_height = static_cast(roi_height) / static_cast(pooled_height); - const T bin_width = static_cast(roi_width) / static_cast(pooled_width); - const int32_t bin_grid_height = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); - const int32_t bin_grid_width = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(bin_grid_height * bin_grid_width, 1); - const T* channel_dptr = in_dptr + (n * channel_num + c) * height * width; - T out_val = 0.0; - FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { - // + .5f for center position - T y = roi_start_h + h * bin_height - + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); - FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { - T x = roi_start_w + w * bin_width - + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); - out_val += BilinearInterpolate(channel_dptr, height, width, y, x); - } - } - out_dptr[index] = out_val / count; - } -} - -template -__global__ void RoiAlignBackward(const int64_t nthreads, const T* out_diff_dptr, const T* rois_dptr, - const T spatial_scale, const int32_t sampling_ratio, - const int64_t channel_num, const int64_t height, - const int64_t width, const int64_t pooled_height, - const int64_t pooled_width, const bool aligned, T* in_diff_dptr) { - const int64_t pooled_area = pooled_height * pooled_width; - const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int64_t h = (index / pooled_width) % pooled_height; - const int64_t w = index % pooled_width; - const int64_t c = (index / pooled_area) % channel_num; - const int64_t r = index / channel_pooled_area; - const T* offset_rois_dptr = rois_dptr + r * 5; - const int64_t n = static_cast(offset_rois_dptr[0]); - const T align_offset = aligned ? static_cast(0.5) : static_cast(0.f); - const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; - const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; - const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; - const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of - // determining minimum roi size - if (aligned == false) { - roi_height = max(roi_height, static_cast(1.0)); - roi_width = max(roi_width, static_cast(1.0)); - } - const T bin_height = static_cast(roi_height) / static_cast(pooled_height); - const T bin_width = static_cast(roi_width) / static_cast(pooled_width); - const int32_t bin_grid_height = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); - const int32_t bin_grid_width = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - - const T count = max(bin_grid_height * bin_grid_width, 1); - const T bin_diff_avg = out_diff_dptr[index] / count; - T* in_diff_channel_dptr = in_diff_dptr + (n * channel_num + c) * height * width; - FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { - // + .5f for center position - T y = roi_start_h + h * bin_height - + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); - FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { - T x = roi_start_w + w * bin_width - + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); - T diff11 = 0; - T diff21 = 0; - T diff12 = 0; - T diff22 = 0; - int32_t x_low = 0; - int32_t x_high = 0; - int32_t y_low = 0; - int32_t y_high = 0; - bool has_diff = BilinearInterpolateDiff(bin_diff_avg, height, width, y, x, diff11, diff21, - diff12, diff22, x_low, x_high, y_low, y_high); - if (has_diff) { - const int64_t q11 = y_low * width + x_low; - const int64_t q21 = y_low * width + x_high; - const int64_t q12 = y_high * width + x_low; - const int64_t q22 = y_high * width + x_high; - atomicAdd(in_diff_channel_dptr + q11, diff11); - atomicAdd(in_diff_channel_dptr + q21, diff21); - atomicAdd(in_diff_channel_dptr + q12, diff12); - atomicAdd(in_diff_channel_dptr + q22, diff22); - } - } - } - } -} - -} // namespace - -template -class RoIAlignKernel final : public user_op::OpKernel { - public: - RoIAlignKernel() = default; - ~RoIAlignKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_blob = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); - if (rois_blob->shape_view().elem_cnt() == 0) { return; } - user_op::Tensor* y_blob = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t pooled_h = ctx->Attr("pooled_h"); - const int32_t pooled_w = ctx->Attr("pooled_w"); - const float spatial_scale = ctx->Attr("spatial_scale"); - const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); - const bool aligned = ctx->Attr("aligned"); - - const int64_t elem_cnt = y_blob->shape_view().elem_cnt(); - RoiAlignForward<<stream()->As()->cuda_stream()>>>( - elem_cnt, x_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, - x_blob->shape_view().At(1), x_blob->shape_view().At(2), x_blob->shape_view().At(3), - pooled_h, pooled_w, aligned, y_blob->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class RoIAlignGradKernel final : public user_op::OpKernel { - public: - RoIAlignGradKernel() = default; - ~RoIAlignGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - if (dx_blob == nullptr) { return; } - Memset(ctx->stream(), dx_blob->mut_dptr(), 0, - dx_blob->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); - const int32_t pooled_h = ctx->Attr("pooled_h"); - const int32_t pooled_w = ctx->Attr("pooled_w"); - const float spatial_scale = ctx->Attr("spatial_scale"); - const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); - const bool aligned = ctx->Attr("aligned"); - - const int64_t elem_cnt = dy_blob->shape_view().elem_cnt(); - if (elem_cnt > 0) { - 
RoiAlignBackward<<stream()->As()->cuda_stream()>>>( - elem_cnt, dy_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, - dx_blob->shape_view().At(1), dx_blob->shape_view().At(2), dx_blob->shape_view().At(3), - pooled_h, pooled_w, aligned, dx_blob->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("roi_align") - .SetCreateFn>() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -REGISTER_USER_KERNEL("roi_align_grad") - .SetCreateFn>() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ T BilinearInterpolate(const T* channel_dptr, const int32_t height, const int32_t width, + T y, T x) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { return 0; } + + if (y <= 0) { y = 0; } + if (x <= 0) { x = 0; } + int32_t y_low = static_cast(y); + int32_t x_low = static_cast(x); + int32_t y_high = 0; + int32_t x_high = 0; + + if (y_low >= height - 1) { + y_low = height - 1; + y_high = y_low; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_low = width - 1; + x_high = x_low; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + const T ly = y - y_low; + const T lx = x - x_low; + const T hy = 1.f - ly; + const T hx = 1.f - lx; + + // https://en.wikipedia.org/wiki/Bilinear_interpolation + const int64_t q11 = y_low * width + x_low; + const int64_t q21 = y_low * width + x_high; + const int64_t q12 = y_high * width + x_low; + const int64_t q22 = y_high * width + x_high; + // no 1 / (x_high - x_low) * (y_high - y_low) because it will always be 1 in RoI Align + return (hy * hx) * channel_dptr[q11] + (hy * lx) * channel_dptr[q21] + + (ly * hx) * channel_dptr[q12] + (ly * lx) * channel_dptr[q22]; +} + +template +__device__ bool BilinearInterpolateDiff(const T bin_diff_avg, const int64_t height, + const int64_t width, T y, T x, T& diff11, T& diff21, + T& diff12, T& diff22, int32_t& x_low, int32_t& x_high, + int32_t& y_low, int32_t& y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { return false; } + + if (y <= 0) { y = 0; } + if (x <= 0) { x = 0; } + + y_low = static_cast(y); + x_low = static_cast(x); + + if (y_low >= height - 1) { + y_low = height - 1; + y_high = y_low; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_low = width - 1; + x_high = x_low; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + const T ly = y - y_low; + const T lx = x - x_low; + const T hy = 1.f - ly; + const T hx = 1.f - lx; + + diff11 = bin_diff_avg * hy * hx; + diff21 = bin_diff_avg * hy * lx; + diff12 = bin_diff_avg * ly * hx; + diff22 = bin_diff_avg * 
ly * lx; + return true; +} + +template +__global__ void RoiAlignForward(const int64_t nthreads, const T* in_dptr, const T* rois_dptr, + const T spatial_scale, const int32_t sampling_ratio, + const int64_t channel_num, const int64_t height, + const int64_t width, const int64_t pooled_height, + const int64_t pooled_width, const bool aligned, T* out_dptr) { + const int64_t pooled_area = pooled_height * pooled_width; + const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int64_t h = (index / pooled_width) % pooled_height; + const int64_t w = index % pooled_width; + const int64_t c = (index / pooled_area) % channel_num; + const int64_t r = index / channel_pooled_area; + const T* offset_rois_dptr = rois_dptr + r * 5; + const int64_t n = static_cast(offset_rois_dptr[0]); + const T align_offset = aligned ? static_cast(0.5) : static_cast(0.f); + const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; + const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; + const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; + const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; + T roi_height = roi_end_h - roi_start_h; + T roi_width = roi_end_w - roi_start_w; + // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of + // determining minimum roi size + if (aligned == false) { + roi_height = max(roi_height, static_cast(1.0)); + roi_width = max(roi_width, static_cast(1.0)); + } + const T bin_height = static_cast(roi_height) / static_cast(pooled_height); + const T bin_width = static_cast(roi_width) / static_cast(pooled_width); + const int32_t bin_grid_height = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); + const int32_t bin_grid_width = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + const T count = max(bin_grid_height * bin_grid_width, 1); + const T* channel_dptr = in_dptr + (n * channel_num + c) * height * width; + T out_val = 0.0; + FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { + // + .5f for center position + T y = roi_start_h + h * bin_height + + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); + FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { + T x = roi_start_w + w * bin_width + + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); + out_val += BilinearInterpolate(channel_dptr, height, width, y, x); + } + } + out_dptr[index] = out_val / count; + } +} + +template +__global__ void RoiAlignBackward(const int64_t nthreads, const T* out_diff_dptr, const T* rois_dptr, + const T spatial_scale, const int32_t sampling_ratio, + const int64_t channel_num, const int64_t height, + const int64_t width, const int64_t pooled_height, + const int64_t pooled_width, const bool aligned, T* in_diff_dptr) { + const int64_t pooled_area = pooled_height * pooled_width; + const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int64_t h = (index / pooled_width) % pooled_height; + const int64_t w = index % pooled_width; + const int64_t c = (index / pooled_area) % channel_num; + const int64_t r = index / channel_pooled_area; + const T* offset_rois_dptr = rois_dptr + r * 5; + const int64_t n = static_cast(offset_rois_dptr[0]); + const T align_offset = aligned ? 
static_cast(0.5) : static_cast(0.f); + const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; + const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; + const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; + const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of + // determining minimum roi size + if (aligned == false) { + roi_height = max(roi_height, static_cast(1.0)); + roi_width = max(roi_width, static_cast(1.0)); + } + const T bin_height = static_cast(roi_height) / static_cast(pooled_height); + const T bin_width = static_cast(roi_width) / static_cast(pooled_width); + const int32_t bin_grid_height = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); + const int32_t bin_grid_width = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + const T count = max(bin_grid_height * bin_grid_width, 1); + const T bin_diff_avg = out_diff_dptr[index] / count; + T* in_diff_channel_dptr = in_diff_dptr + (n * channel_num + c) * height * width; + FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { + // + .5f for center position + T y = roi_start_h + h * bin_height + + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); + FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { + T x = roi_start_w + w * bin_width + + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); + T diff11 = 0; + T diff21 = 0; + T diff12 = 0; + T diff22 = 0; + int32_t x_low = 0; + int32_t x_high = 0; + int32_t y_low = 0; + int32_t y_high = 0; + bool has_diff = BilinearInterpolateDiff(bin_diff_avg, height, width, y, x, diff11, diff21, + diff12, diff22, x_low, x_high, y_low, y_high); + if (has_diff) { + const int64_t q11 = y_low * width + x_low; + const int64_t q21 = y_low * width + x_high; + const int64_t q12 = y_high * width + x_low; + const int64_t q22 = y_high * width + x_high; + atomicAdd(in_diff_channel_dptr + q11, diff11); + atomicAdd(in_diff_channel_dptr + q21, diff21); + atomicAdd(in_diff_channel_dptr + q12, diff12); + atomicAdd(in_diff_channel_dptr + q22, diff22); + } + } + } + } +} + +} // namespace + +template +class RoIAlignKernel final : public user_op::OpKernel { + public: + RoIAlignKernel() = default; + ~RoIAlignKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_blob = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); + if (rois_blob->shape_view().elem_cnt() == 0) { return; } + user_op::Tensor* y_blob = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t pooled_h = ctx->Attr("pooled_h"); + const int32_t pooled_w = ctx->Attr("pooled_w"); + const float spatial_scale = ctx->Attr("spatial_scale"); + const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); + const bool aligned = ctx->Attr("aligned"); + + const int64_t elem_cnt = y_blob->shape_view().elem_cnt(); + RoiAlignForward<<stream()->As()->cuda_stream()>>>( + elem_cnt, x_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, + x_blob->shape_view().At(1), x_blob->shape_view().At(2), x_blob->shape_view().At(3), + pooled_h, pooled_w, aligned, y_blob->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class 
RoIAlignGradKernel final : public user_op::OpKernel { + public: + RoIAlignGradKernel() = default; + ~RoIAlignGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + if (dx_blob == nullptr) { return; } + Memset(ctx->stream(), dx_blob->mut_dptr(), 0, + dx_blob->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); + const int32_t pooled_h = ctx->Attr("pooled_h"); + const int32_t pooled_w = ctx->Attr("pooled_w"); + const float spatial_scale = ctx->Attr("spatial_scale"); + const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); + const bool aligned = ctx->Attr("aligned"); + + const int64_t elem_cnt = dy_blob->shape_view().elem_cnt(); + if (elem_cnt > 0) { + RoiAlignBackward<<stream()->As()->cuda_stream()>>>( + elem_cnt, dy_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, + dx_blob->shape_view().At(1), dx_blob->shape_view().At(2), dx_blob->shape_view().At(3), + pooled_h, pooled_w, aligned, dx_blob->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("roi_align") + .SetCreateFn>() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +REGISTER_USER_KERNEL("roi_align_grad") + .SetCreateFn>() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/roll_kernel.hip.cpp b/oneflow/user/kernels/roll_kernel.hip.cpp index dc65156..c1e119d 100644 --- a/oneflow/user/kernels/roll_kernel.hip.cpp +++ b/oneflow/user/kernels/roll_kernel.hip.cpp @@ -1,295 +1,295 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/roll_kernel_utils.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void RollCudaKernel(const T* in_ptr, const SHIFTS shifts, const SHAPE shape, - const STRIDE stride, const int64_t elements, T* out_ptr) { - int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (global_index < elements) { - int32_t shifted_global_index = - getShiftedIndex(global_index, shifts.val, shape.val, stride.val); - out_ptr[global_index] = in_ptr[shifted_global_index]; - global_index += step; - } -} - -template -struct GpuRollFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const SHIFTS shifts, const SHAPE shape, - const STRIDE stride, const int64_t elements, T* out_ptr) { - RollCudaKernel<<As()->cuda_stream()>>>( - in_ptr, shifts, shape, stride, elements, out_ptr); - } -}; - -template -struct GpuRollFunctor final { - void operator()(ep::Stream* stream, const float16* in_ptr, const SHIFTS shifts, const SHAPE shape, - const STRIDE stride, const int64_t elements, float16* out_ptr) { - RollCudaKernel<<As()->cuda_stream()>>>( - reinterpret_cast(in_ptr), shifts, shape, stride, elements, - reinterpret_cast(out_ptr)); - } -}; - -template -__global__ void RollFlattenCudaKernel(const T* in_ptr, const int64_t start, - const int64_t elem_count_minus_start, const int64_t elements, - T* out_ptr) { - int64_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - - while (global_index < elements) { - int64_t source_idx = 0; - if (global_index >= elem_count_minus_start) { - source_idx = global_index - elem_count_minus_start; - } else { - source_idx = global_index + start; - } - out_ptr[global_index] = in_ptr[source_idx]; - - global_index += step; - } -} - -template -struct GpuRollFlattenFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const int64_t start, - const int64_t elem_count_minus_start, const int64_t elements, T* out_ptr) { - RollFlattenCudaKernel<<As()->cuda_stream()>>>( - in_ptr, start, elem_count_minus_start, elements, out_ptr); - } -}; - -template<> -void GpuRollFlattenFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const int64_t start, - const int64_t elem_count_minus_start, - const int64_t elements, float16* out_ptr) { - RollFlattenCudaKernel<<As()->cuda_stream()>>>( - reinterpret_cast(in_ptr), start, elem_count_minus_start, elements, - reinterpret_cast(out_ptr)); -} - -template -__global__ void Roll1DimCudaKernel(const T* in_ptr, const int32_t stride_x_size, - const int32_t stride, const int32_t size_minus_start, - const int32_t size_minus_start_x_stride, - const int32_t start_x_stride, const int64_t elements, - T* out_ptr) { - int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - - while (global_index < elements) { - // roll dim idx is the index of linear_index along the rolling dimension. - int32_t roll_dim_idx = global_index % stride_x_size / stride; - // index into the source data to find appropriate value. 
- int32_t source_idx = 0; - if (roll_dim_idx >= size_minus_start) { - source_idx = global_index - size_minus_start_x_stride; - } else { - source_idx = global_index + start_x_stride; - } - out_ptr[global_index] = in_ptr[source_idx]; - - global_index += step; - } -} - -template -struct GpuRoll1DimFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const int32_t stride_x_size, - const int32_t stride, const int32_t size_minus_start, - const int32_t size_minus_start_x_stride, const int32_t start_x_stride, - const int64_t elements, T* out_ptr) { - Roll1DimCudaKernel<<As()->cuda_stream()>>>( - in_ptr, stride_x_size, stride, size_minus_start, size_minus_start_x_stride, start_x_stride, - elements, out_ptr); - } -}; - -template<> -void GpuRoll1DimFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const int32_t stride_x_size, const int32_t stride, - const int32_t size_minus_start, - const int32_t size_minus_start_x_stride, - const int32_t start_x_stride, const int64_t elements, - float16* out_ptr) { - Roll1DimCudaKernel<<As()->cuda_stream()>>>( - reinterpret_cast(in_ptr), stride_x_size, stride, size_minus_start, - size_minus_start_x_stride, start_x_stride, elements, reinterpret_cast(out_ptr)); -} - -} // namespace - -template -class GpuRollKernel final : public user_op::OpKernel { - public: - GpuRollKernel() = default; - ~GpuRollKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const std::vector& shifts = ctx->Attr>("shifts"); - const std::vector& dims = ctx->Attr>("dims"); - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - const int64_t elem_count = out->shape_view().elem_cnt(); - - if (dims[0] == -1) { - // NOTE(Liang Depeng): Borrow the implementation of pytorch and simplify to 1d array case. - int64_t start = (elem_count - shifts[0]) % elem_count; - if (start < 0) start = start + elem_count; - const int64_t elem_count_minus_start = elem_count - start; - GpuRollFlattenFunctor()(ctx->stream(), in_ptr, start, elem_count_minus_start, elem_count, - out_ptr); - } else { - SHAPE new_shape{}; - SHIFTS new_shifts{}; - int32_t num_axes = 0; - computeParams(in->shape_view(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); - - STRIDE stride{}; - initStride(stride, new_shape, num_axes); - - if (dims.size() == 1) { - // NOTE(Liang Depeng): Borrow the implementation of pytorch - const int32_t size = new_shape.val[dims[0]]; - int32_t start = (size - new_shifts.val[dims[0]]) % size; - // Behavior of % is different in C++ vs Python for negative numbers. This - // corrects the difference. 
- if (start < 0) start = start + size; - - const int32_t stride_x_size = stride.val[dims[0]] * size; - const int32_t size_minus_start = size - start; - const int32_t size_minus_start_x_stride = size_minus_start * stride.val[dims[0]]; - const int32_t start_x_stride = start * stride.val[dims[0]]; - - GpuRoll1DimFunctor()(ctx->stream(), in_ptr, stride_x_size, stride.val[dims[0]], - size_minus_start, size_minus_start_x_stride, start_x_stride, - elem_count, out_ptr); - - } else { - transformShifts(new_shifts.val, new_shape.val, num_axes); - switch (num_axes) { - case 1: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 2: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 3: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 4: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 5: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 6: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 7: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 8: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 9: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 10: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 11: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 12: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 13: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 14: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 15: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 16: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - default: break; - } - } - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_ROLL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("roll").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_ROLL_KERNEL(float); -REGISTER_ROLL_KERNEL(double); -REGISTER_ROLL_KERNEL(float16); -REGISTER_ROLL_KERNEL(bool); -REGISTER_ROLL_KERNEL(uint8_t); -REGISTER_ROLL_KERNEL(int8_t); -REGISTER_ROLL_KERNEL(int32_t); -REGISTER_ROLL_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/roll_kernel_utils.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void RollCudaKernel(const T* in_ptr, const SHIFTS shifts, const SHAPE shape, + const STRIDE stride, const int64_t elements, T* out_ptr) { + int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (global_index < elements) { + int32_t shifted_global_index = + getShiftedIndex(global_index, shifts.val, shape.val, stride.val); + out_ptr[global_index] = in_ptr[shifted_global_index]; + global_index += step; + } +} + +template +struct GpuRollFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const SHIFTS shifts, const SHAPE shape, + const STRIDE stride, const int64_t elements, T* out_ptr) { + RollCudaKernel<<As()->cuda_stream()>>>( + in_ptr, shifts, shape, stride, elements, out_ptr); + } +}; + +template +struct GpuRollFunctor final { + void operator()(ep::Stream* stream, const float16* in_ptr, const SHIFTS shifts, const SHAPE shape, + const STRIDE stride, const int64_t elements, float16* out_ptr) { + RollCudaKernel<<As()->cuda_stream()>>>( + reinterpret_cast(in_ptr), shifts, shape, stride, elements, + reinterpret_cast(out_ptr)); + } +}; + +template +__global__ void RollFlattenCudaKernel(const T* in_ptr, const int64_t start, + const int64_t elem_count_minus_start, const int64_t elements, + T* out_ptr) { + int64_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + + while (global_index < elements) { + int64_t source_idx = 0; + if (global_index >= elem_count_minus_start) { + source_idx = global_index - elem_count_minus_start; + } else { + source_idx = global_index + start; + } + out_ptr[global_index] = in_ptr[source_idx]; + + global_index += step; + } +} + +template +struct GpuRollFlattenFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const int64_t start, + const int64_t elem_count_minus_start, const int64_t elements, T* out_ptr) { + RollFlattenCudaKernel<<As()->cuda_stream()>>>( + in_ptr, start, elem_count_minus_start, elements, out_ptr); + } +}; + +template<> +void GpuRollFlattenFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const int64_t start, + const int64_t elem_count_minus_start, + const int64_t elements, float16* out_ptr) { + RollFlattenCudaKernel<<As()->cuda_stream()>>>( + reinterpret_cast(in_ptr), start, elem_count_minus_start, elements, + reinterpret_cast(out_ptr)); +} + +template +__global__ void Roll1DimCudaKernel(const T* in_ptr, const int32_t stride_x_size, + const int32_t stride, const int32_t size_minus_start, + const int32_t size_minus_start_x_stride, + const int32_t start_x_stride, const int64_t elements, + T* out_ptr) { + int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + + while (global_index < elements) { + // roll dim idx is the index of linear_index along the rolling dimension. + int32_t roll_dim_idx = global_index % stride_x_size / stride; + // index into the source data to find appropriate value. 
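    // [Editor's note: illustrative worked example, not part of the original patch.]
    // The branch below realizes out[j] = in[(j + start) % size] along the rolled
    // dimension without a modulo in the inner loop. Assuming stride == 1 (so
    // start_x_stride == start and size_minus_start_x_stride == size_minus_start),
    // size == 4 and shift == 1, hence start = (4 - 1) % 4 = 3 and size_minus_start = 1:
    //   j = 0: 0 <  size_minus_start -> source_idx = 0 + start            = 3
    //   j = 1: 1 >= size_minus_start -> source_idx = 1 - size_minus_start = 0
    //   j = 2:                          source_idx = 2 - size_minus_start = 1
    //   j = 3:                          source_idx = 3 - size_minus_start = 2
    // i.e. the output {in[3], in[0], in[1], in[2]} is the input rolled forward by one.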
+ int32_t source_idx = 0; + if (roll_dim_idx >= size_minus_start) { + source_idx = global_index - size_minus_start_x_stride; + } else { + source_idx = global_index + start_x_stride; + } + out_ptr[global_index] = in_ptr[source_idx]; + + global_index += step; + } +} + +template +struct GpuRoll1DimFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const int32_t stride_x_size, + const int32_t stride, const int32_t size_minus_start, + const int32_t size_minus_start_x_stride, const int32_t start_x_stride, + const int64_t elements, T* out_ptr) { + Roll1DimCudaKernel<<As()->cuda_stream()>>>( + in_ptr, stride_x_size, stride, size_minus_start, size_minus_start_x_stride, start_x_stride, + elements, out_ptr); + } +}; + +template<> +void GpuRoll1DimFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const int32_t stride_x_size, const int32_t stride, + const int32_t size_minus_start, + const int32_t size_minus_start_x_stride, + const int32_t start_x_stride, const int64_t elements, + float16* out_ptr) { + Roll1DimCudaKernel<<As()->cuda_stream()>>>( + reinterpret_cast(in_ptr), stride_x_size, stride, size_minus_start, + size_minus_start_x_stride, start_x_stride, elements, reinterpret_cast(out_ptr)); +} + +} // namespace + +template +class GpuRollKernel final : public user_op::OpKernel { + public: + GpuRollKernel() = default; + ~GpuRollKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::vector& shifts = ctx->Attr>("shifts"); + const std::vector& dims = ctx->Attr>("dims"); + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + const int64_t elem_count = out->shape_view().elem_cnt(); + + if (dims[0] == -1) { + // NOTE(Liang Depeng): Borrow the implementation of pytorch and simplify to 1d array case. + int64_t start = (elem_count - shifts[0]) % elem_count; + if (start < 0) start = start + elem_count; + const int64_t elem_count_minus_start = elem_count - start; + GpuRollFlattenFunctor()(ctx->stream(), in_ptr, start, elem_count_minus_start, elem_count, + out_ptr); + } else { + SHAPE new_shape{}; + SHIFTS new_shifts{}; + int32_t num_axes = 0; + computeParams(in->shape_view(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); + + STRIDE stride{}; + initStride(stride, new_shape, num_axes); + + if (dims.size() == 1) { + // NOTE(Liang Depeng): Borrow the implementation of pytorch + const int32_t size = new_shape.val[dims[0]]; + int32_t start = (size - new_shifts.val[dims[0]]) % size; + // Behavior of % is different in C++ vs Python for negative numbers. This + // corrects the difference. 
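        // [Editor's note: illustrative example, not part of the original patch.]
        // C++ '%' truncates toward zero while Python floors: with size == 4 and a
        // shift of 6, (4 - 6) % 4 == -2 in C++ (Python yields 2), so the fix-up below
        // gives start = -2 + 4 = 2, matching the PyTorch convention this kernel borrows.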
+ if (start < 0) start = start + size; + + const int32_t stride_x_size = stride.val[dims[0]] * size; + const int32_t size_minus_start = size - start; + const int32_t size_minus_start_x_stride = size_minus_start * stride.val[dims[0]]; + const int32_t start_x_stride = start * stride.val[dims[0]]; + + GpuRoll1DimFunctor()(ctx->stream(), in_ptr, stride_x_size, stride.val[dims[0]], + size_minus_start, size_minus_start_x_stride, start_x_stride, + elem_count, out_ptr); + + } else { + transformShifts(new_shifts.val, new_shape.val, num_axes); + switch (num_axes) { + case 1: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 2: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 3: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 4: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 5: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 6: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 7: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 8: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 9: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 10: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 11: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 12: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 13: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 14: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 15: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 16: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + default: break; + } + } + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_ROLL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("roll").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_ROLL_KERNEL(float); +REGISTER_ROLL_KERNEL(double); +REGISTER_ROLL_KERNEL(float16); +REGISTER_ROLL_KERNEL(bool); +REGISTER_ROLL_KERNEL(uint8_t); +REGISTER_ROLL_KERNEL(int8_t); +REGISTER_ROLL_KERNEL(int32_t); +REGISTER_ROLL_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/scalar_math_kernels.hip.cpp b/oneflow/user/kernels/scalar_math_kernels.hip.cpp index 6691786..377e669 100644 --- a/oneflow/user/kernels/scalar_math_kernels.hip.cpp +++ b/oneflow/user/kernels/scalar_math_kernels.hip.cpp @@ -1,223 +1,223 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/user/kernels/scalar_math_kernels.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -template class Op, typename T> -struct UnaryByScalarFunctor { - __host__ __device__ explicit UnaryByScalarFunctor(T scalar) : scalar(scalar) {} - __device__ T operator()(T a) const { return Op::Invoke(a, scalar); } - const T scalar; -}; - -template class Op, typename T> -struct UnaryByScalarReverseFunctor { - __host__ __device__ explicit UnaryByScalarReverseFunctor(T scalar) : scalar(scalar) {} - __device__ T operator()(T a) const { return Op::Invoke(scalar, a); } - const T scalar; -}; - -template class Op> -struct UnaryByScalarFunctor { - __host__ __device__ explicit UnaryByScalarFunctor(half scalar) : scalar(scalar) {} - __device__ half operator()(half a) const { return Op::Invoke(a, scalar); } - const half scalar; -}; - -template class Op> -struct UnaryByScalarReverseFunctor { - __host__ __device__ explicit UnaryByScalarReverseFunctor(half scalar) : scalar(scalar) {} - __device__ half operator()(half a) const { return Op::Invoke(scalar, a); } - const half scalar; -}; - -template class BIN_OP, typename T> -struct ScalarMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarFunctor(scalar), elem_cnt, out, - in, stream->As()->cuda_stream())); - } -}; - -template class BIN_OP> -struct ScalarMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, - float16* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary( - UnaryByScalarFunctor(float16_2half(scalar)), elem_cnt, - reinterpret_cast(out), reinterpret_cast(in), - stream->As()->cuda_stream())); - } -}; - -template class BIN_OP, typename T> -struct ScalarReverseMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarReverseFunctor(scalar), elem_cnt, - out, in, stream->As()->cuda_stream())); - } -}; - -template class BIN_OP> -struct ScalarReverseMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, - float16* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary( - UnaryByScalarReverseFunctor(float16_2half(scalar)), elem_cnt, - reinterpret_cast(out), reinterpret_cast(in), - stream->As()->cuda_stream())); - } -}; - -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncAdd); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFloorDiv); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFMod); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncMul); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncDiv); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); -INSTANTIATE_SCALAR_REVERSE_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); - -template -struct ScalarPowGradFunctor { - OF_DEVICE_FUNC explicit 
ScalarPowGradFunctor(T exponent) : exponent(exponent) {} - __device__ T operator()(T x, T dy) const { - return exponent * (pow(x, exponent - static_cast(1.0))) * dy; - } - const T exponent; -}; - -template<> -struct ScalarPowGradFunctor { - OF_DEVICE_FUNC explicit ScalarPowGradFunctor(half exponent) : exponent(exponent) {} - __device__ half operator()(half x, half dy) const { - return __float2half(__half2float(exponent) - * (powf(__half2float(x), __half2float(exponent) - static_cast(1.0))) - * __half2float(dy)); - } - const half exponent; -}; - -template -struct ScalarReversePowGradFunctor { - OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(T exponent) : exponent(exponent) {} - __device__ T operator()(T x, T dy) const { return pow(exponent, x) * log(exponent) * dy; } - const T exponent; -}; - -template<> -struct ScalarReversePowGradFunctor { - OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(float exponent) : exponent(exponent) {} - __device__ float operator()(float x, float dy) const { - return powf(exponent, x) * logf(exponent) * dy; - } - const float exponent; -}; - -template<> -struct ScalarReversePowGradFunctor { - OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(half exponent) : exponent(exponent) {} - __device__ half operator()(half x, half dy) const { - const float exp = __half2float(exponent); - return __float2half(exp * powf(exp, __half2float(x)) * logf(exp) * __half2float(dy)); - } - const half exponent; -}; - -template -class GpuScalarPowGradKernel final : public user_op::OpKernel { - public: - GpuScalarPowGradKernel() = default; - ~GpuScalarPowGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - const T* x_ptr = x_tensor->dptr(); - const T* dy_ptr = dy_tensor->dptr(); - T* dx_ptr = dx_tensor->mut_dptr(); - T scalar_operand = static_cast(0); - if (ctx->Attr("has_int_operand")) { - scalar_operand = static_cast(ctx->Attr("int_operand")); - } else if (ctx->Attr("has_float_operand")) { - scalar_operand = static_cast(ctx->Attr("float_operand")); - } else { - UNIMPLEMENTED(); - } - const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); - OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( - ScalarPowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, - ctx->stream()->As()->cuda_stream()))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("scalar_pow_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); - -template -class GpuScalarReversePowGradKernel final : public user_op::OpKernel { - public: - GpuScalarReversePowGradKernel() = default; - ~GpuScalarReversePowGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx_tensor = 
ctx->Tensor4ArgNameAndIndex("dx", 0); - const T* x_ptr = x_tensor->dptr(); - const T* dy_ptr = dy_tensor->dptr(); - T* dx_ptr = dx_tensor->mut_dptr(); - T scalar_operand = static_cast(0); - if (ctx->Attr("has_int_operand")) { - scalar_operand = static_cast(ctx->Attr("int_operand")); - } else if (ctx->Attr("has_float_operand")) { - scalar_operand = static_cast(ctx->Attr("float_operand")); - } else { - UNIMPLEMENTED(); - } - const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); - OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( - ScalarReversePowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, - ctx->stream()->As()->cuda_stream()))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("scalar_reverse_pow_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/scalar_math_kernels.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +template class Op, typename T> +struct UnaryByScalarFunctor { + __host__ __device__ explicit UnaryByScalarFunctor(T scalar) : scalar(scalar) {} + __device__ T operator()(T a) const { return Op::Invoke(a, scalar); } + const T scalar; +}; + +template class Op, typename T> +struct UnaryByScalarReverseFunctor { + __host__ __device__ explicit UnaryByScalarReverseFunctor(T scalar) : scalar(scalar) {} + __device__ T operator()(T a) const { return Op::Invoke(scalar, a); } + const T scalar; +}; + +template class Op> +struct UnaryByScalarFunctor { + __host__ __device__ explicit UnaryByScalarFunctor(half scalar) : scalar(scalar) {} + __device__ half operator()(half a) const { return Op::Invoke(a, scalar); } + const half scalar; +}; + +template class Op> +struct UnaryByScalarReverseFunctor { + __host__ __device__ explicit UnaryByScalarReverseFunctor(half scalar) : scalar(scalar) {} + __device__ half operator()(half a) const { return Op::Invoke(scalar, a); } + const half scalar; +}; + +template class BIN_OP, typename T> +struct ScalarMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarFunctor(scalar), elem_cnt, out, + in, stream->As()->cuda_stream())); + } +}; + +template class BIN_OP> +struct ScalarMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, + float16* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary( + UnaryByScalarFunctor(float16_2half(scalar)), 
elem_cnt, + reinterpret_cast(out), reinterpret_cast(in), + stream->As()->cuda_stream())); + } +}; + +template class BIN_OP, typename T> +struct ScalarReverseMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarReverseFunctor(scalar), elem_cnt, + out, in, stream->As()->cuda_stream())); + } +}; + +template class BIN_OP> +struct ScalarReverseMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, + float16* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary( + UnaryByScalarReverseFunctor(float16_2half(scalar)), elem_cnt, + reinterpret_cast(out), reinterpret_cast(in), + stream->As()->cuda_stream())); + } +}; + +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncAdd); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFloorDiv); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFMod); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncMul); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncDiv); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); +INSTANTIATE_SCALAR_REVERSE_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); + +template +struct ScalarPowGradFunctor { + OF_DEVICE_FUNC explicit ScalarPowGradFunctor(T exponent) : exponent(exponent) {} + __device__ T operator()(T x, T dy) const { + return exponent * (pow(x, exponent - static_cast(1.0))) * dy; + } + const T exponent; +}; + +template<> +struct ScalarPowGradFunctor { + OF_DEVICE_FUNC explicit ScalarPowGradFunctor(half exponent) : exponent(exponent) {} + __device__ half operator()(half x, half dy) const { + return __float2half(__half2float(exponent) + * (powf(__half2float(x), __half2float(exponent) - static_cast(1.0))) + * __half2float(dy)); + } + const half exponent; +}; + +template +struct ScalarReversePowGradFunctor { + OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(T exponent) : exponent(exponent) {} + __device__ T operator()(T x, T dy) const { return pow(exponent, x) * log(exponent) * dy; } + const T exponent; +}; + +template<> +struct ScalarReversePowGradFunctor { + OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(float exponent) : exponent(exponent) {} + __device__ float operator()(float x, float dy) const { + return powf(exponent, x) * logf(exponent) * dy; + } + const float exponent; +}; + +template<> +struct ScalarReversePowGradFunctor { + OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(half exponent) : exponent(exponent) {} + __device__ half operator()(half x, half dy) const { + const float exp = __half2float(exponent); + return __float2half(exp * powf(exp, __half2float(x)) * logf(exp) * __half2float(dy)); + } + const half exponent; +}; + +template +class GpuScalarPowGradKernel final : public user_op::OpKernel { + public: + GpuScalarPowGradKernel() = default; + ~GpuScalarPowGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const T* x_ptr = x_tensor->dptr(); + const T* dy_ptr = dy_tensor->dptr(); + T* dx_ptr = dx_tensor->mut_dptr(); + T scalar_operand = static_cast(0); + if (ctx->Attr("has_int_operand")) { + scalar_operand = 
static_cast(ctx->Attr("int_operand")); + } else if (ctx->Attr("has_float_operand")) { + scalar_operand = static_cast(ctx->Attr("float_operand")); + } else { + UNIMPLEMENTED(); + } + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); + OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( + ScalarPowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, + ctx->stream()->As()->cuda_stream()))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("scalar_pow_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); + +template +class GpuScalarReversePowGradKernel final : public user_op::OpKernel { + public: + GpuScalarReversePowGradKernel() = default; + ~GpuScalarReversePowGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const T* x_ptr = x_tensor->dptr(); + const T* dy_ptr = dy_tensor->dptr(); + T* dx_ptr = dx_tensor->mut_dptr(); + T scalar_operand = static_cast(0); + if (ctx->Attr("has_int_operand")) { + scalar_operand = static_cast(ctx->Attr("int_operand")); + } else if (ctx->Attr("has_float_operand")) { + scalar_operand = static_cast(ctx->Attr("float_operand")); + } else { + UNIMPLEMENTED(); + } + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); + OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( + ScalarReversePowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, + ctx->stream()->As()->cuda_stream()))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("scalar_reverse_pow_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/search_sorted_kernel.hip.cpp b/oneflow/user/kernels/search_sorted_kernel.hip.cpp index 2fcdabe..bb65bbe 100644 --- a/oneflow/user/kernels/search_sorted_kernel.hip.cpp +++ b/oneflow/user/kernels/search_sorted_kernel.hip.cpp @@ -1,129 +1,129 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/user/kernels/search_sorted_kernel_util.h" - -namespace oneflow { - -template -__global__ void DoSearchSortedLogical(int32_t instance_num, bool is_sequence_1d, - K values_shape_last, K sequence_shape_last, bool right, - const T* values_ptr, const T* sequence_ptr, K* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, instance_num) { - K start_bd = is_sequence_1d ? 0 : i / values_shape_last * sequence_shape_last; - K end_bd = start_bd + sequence_shape_last; - K pos = !right - ? cus_lower_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd - : cus_upper_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd; - out_ptr[i] = pos; - } -} - -template -__global__ void DoSearchSortedScalarLogical(K sequence_shape_last, bool right, const T values, - const T* sequence_ptr, K* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, 1) { - K pos = !right ? cus_lower_bound(0, sequence_shape_last, values, sequence_ptr) - : cus_upper_bound(0, sequence_shape_last, values, sequence_ptr); - out_ptr[0] = pos; - } -} - -template -class GpuSearchSortedKernel final : public user_op::OpKernel { - public: - GpuSearchSortedKernel() = default; - ~GpuSearchSortedKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); - const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); - - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const bool& right = ctx->Attr("right"); - const T* values_ptr = values->dptr(); - const T* sequence_ptr = sorted_sequence->dptr(); - K* out_ptr = out->mut_dptr(); - const int32_t instance_num = values->shape_view().elem_cnt(); - bool is_values_scalar = values->shape_view().NumAxes() == 0; - bool is_sequence_1d = (sorted_sequence->shape_view().NumAxes() == 1); - K values_shape_last = - is_values_scalar ? 
1 : values->shape_view().At(values->shape_view().NumAxes() - 1); - K sequence_shape_last = - sorted_sequence->shape_view().At(sorted_sequence->shape_view().NumAxes() - 1); - RUN_CUDA_KERNEL((DoSearchSortedLogical), ctx->stream(), instance_num, instance_num, - is_sequence_1d, values_shape_last, sequence_shape_last, right, values_ptr, - sequence_ptr, out_ptr); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_GPU_SEARCH_SORTED_KERNEL(in_dtype, out_dtype) \ - REGISTER_USER_KERNEL("searchsorted") \ - .SetCreateFn< \ - GpuSearchSortedKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ - && (user_op::HobDataType("values", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - -template -class GpuSearchSortedScalarKernel final : public user_op::OpKernel { - public: - GpuSearchSortedScalarKernel() = default; - ~GpuSearchSortedScalarKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const bool& right = ctx->Attr("right"); - const T& values = static_cast(ctx->Attr("values")); - - const T* sequence_ptr = sorted_sequence->dptr(); - K* out_ptr = out->mut_dptr(); - K sequence_shape_last = sorted_sequence->shape_view().At(0); - RUN_CUDA_KERNEL((DoSearchSortedScalarLogical), ctx->stream(), 1, sequence_shape_last, - right, values, sequence_ptr, out_ptr); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL(in_dtype, out_dtype) \ - REGISTER_USER_KERNEL("searchsorted_scalar") \ - .SetCreateFn< \ - GpuSearchSortedScalarKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/user/kernels/search_sorted_kernel_util.h" + +namespace oneflow { + +template +__global__ void DoSearchSortedLogical(int32_t instance_num, bool is_sequence_1d, + K values_shape_last, K sequence_shape_last, bool right, + const T* values_ptr, const T* sequence_ptr, K* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, instance_num) { + K start_bd = is_sequence_1d ? 0 : i / values_shape_last * sequence_shape_last; + K end_bd = start_bd + sequence_shape_last; + K pos = !right + ? cus_lower_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd + : cus_upper_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd; + out_ptr[i] = pos; + } +} + +template +__global__ void DoSearchSortedScalarLogical(K sequence_shape_last, bool right, const T values, + const T* sequence_ptr, K* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, 1) { + K pos = !right ? cus_lower_bound(0, sequence_shape_last, values, sequence_ptr) + : cus_upper_bound(0, sequence_shape_last, values, sequence_ptr); + out_ptr[0] = pos; + } +} + +template +class GpuSearchSortedKernel final : public user_op::OpKernel { + public: + GpuSearchSortedKernel() = default; + ~GpuSearchSortedKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); + const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); + + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const bool& right = ctx->Attr("right"); + const T* values_ptr = values->dptr(); + const T* sequence_ptr = sorted_sequence->dptr(); + K* out_ptr = out->mut_dptr(); + const int32_t instance_num = values->shape_view().elem_cnt(); + bool is_values_scalar = values->shape_view().NumAxes() == 0; + bool is_sequence_1d = (sorted_sequence->shape_view().NumAxes() == 1); + K values_shape_last = + is_values_scalar ? 
1 : values->shape_view().At(values->shape_view().NumAxes() - 1); + K sequence_shape_last = + sorted_sequence->shape_view().At(sorted_sequence->shape_view().NumAxes() - 1); + RUN_CUDA_KERNEL((DoSearchSortedLogical), ctx->stream(), instance_num, instance_num, + is_sequence_1d, values_shape_last, sequence_shape_last, right, values_ptr, + sequence_ptr, out_ptr); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_GPU_SEARCH_SORTED_KERNEL(in_dtype, out_dtype) \ + REGISTER_USER_KERNEL("searchsorted") \ + .SetCreateFn< \ + GpuSearchSortedKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ + && (user_op::HobDataType("values", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ + && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + +template +class GpuSearchSortedScalarKernel final : public user_op::OpKernel { + public: + GpuSearchSortedScalarKernel() = default; + ~GpuSearchSortedScalarKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const bool& right = ctx->Attr("right"); + const T& values = static_cast(ctx->Attr("values")); + + const T* sequence_ptr = sorted_sequence->dptr(); + K* out_ptr = out->mut_dptr(); + K sequence_shape_last = sorted_sequence->shape_view().At(0); + RUN_CUDA_KERNEL((DoSearchSortedScalarLogical), ctx->stream(), 1, sequence_shape_last, + right, values, sequence_ptr, out_ptr); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL(in_dtype, out_dtype) \ + REGISTER_USER_KERNEL("searchsorted_scalar") \ + .SetCreateFn< \ + GpuSearchSortedScalarKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ + && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp index 237d71b..2181b4b 100644 --- a/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp @@ -1,55 +1,55 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/sigmoid_cross_entropy_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { -template class Opt, typename PredT, typename LabelT> -struct ElemwiseSigmoidCrossEntropyGradFunctor final { - void operator()(ep::Stream* stream, int64_t n, PredT* prediction_diff, const PredT* prediction, - const LabelT* label, const PredT* loss_diff) { - OF_CUDA_CHECK(cuda::elementwise::Ternary(Opt(), n, prediction_diff, prediction, - label, loss_diff, - stream->As()->cuda_stream())); - } -}; - -template class Opt, typename PredT, typename LabelT> -struct ElemwiseSigmoidCrossEntropyFunctor final { - void operator()(ep::Stream* stream, int64_t n, PredT* loss, const PredT* prediction, - const LabelT* label) { - OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), n, loss, prediction, label, - stream->As()->cuda_stream())); - } -}; -} // namespace -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, float) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, double) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, float) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/sigmoid_cross_entropy_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { +template class Opt, typename PredT, typename LabelT> +struct ElemwiseSigmoidCrossEntropyGradFunctor final { + void operator()(ep::Stream* stream, int64_t n, PredT* prediction_diff, const PredT* prediction, + const LabelT* label, const PredT* loss_diff) { + OF_CUDA_CHECK(cuda::elementwise::Ternary(Opt(), n, prediction_diff, prediction, + label, loss_diff, + stream->As()->cuda_stream())); + } +}; + +template class Opt, typename PredT, typename LabelT> +struct ElemwiseSigmoidCrossEntropyFunctor final { + void operator()(ep::Stream* stream, int64_t n, PredT* loss, const PredT* prediction, + const LabelT* label) { + OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), n, loss, prediction, label, + stream->As()->cuda_stream())); + } +}; +} // namespace +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, float) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, double) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, float) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/slice_util.hip.cpp b/oneflow/user/kernels/slice_util.hip.cpp index a008c27..2f32177 100644 --- a/oneflow/user/kernels/slice_util.hip.cpp +++ b/oneflow/user/kernels/slice_util.hip.cpp @@ -1,232 +1,232 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/slice_util.h" -#include "oneflow/core/common/switch_func.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void SliceForwardGpu(const int n, SliceParams params, - SliceIndexHelper entire_idx_cvtr, - SliceIndexHelper sliced_idx_cvtr, const T* entire, - T* sliced) { - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); - sliced[i] = entire[offset]; - } -} - -template -__global__ void SliceForwardGpu(const int n, SliceParams entire_params, SliceParams sliced_params, - SliceIndexHelper entire_splitted_large_idx_cvtr, - SliceIndexHelper sliced_splitted_large_idx_cvtr, - SliceIndexHelper entire_full_small_idx_cvtr, - SliceIndexHelper sliced_full_small_idx_cvtr, const T* entire, - T* sliced) { - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t entire_offset = SliceOffsetToEntireOffset( - i, entire_params, entire_splitted_large_idx_cvtr, sliced_splitted_large_idx_cvtr); - int64_t sliced_offset = SliceOffsetToEntireOffset( - i, sliced_params, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr); - sliced[sliced_offset] = entire[entire_offset]; - } -} - -template -__global__ void SliceBackwardGpu(const int n, SliceParams params, - SliceIndexHelper entire_idx_cvtr, - SliceIndexHelper sliced_idx_cvtr, T* entire, - const T* sliced) { - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); - entire[offset] = sliced[i]; - } -} - -template -void LaunchSliceForward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { - CHECK_EQ(params.ndim, NDIM); - int64_t elem_cnt = params.elem_cnt(); - SliceIndexHelper entire_idx_cvtr(params.dims); - SliceIndexHelper sliced_idx_cvtr(params.size); - if (elem_cnt == 0) { return; } - SliceForwardGpu<<As()->cuda_stream()>>>( - elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, sliced); -} - -template -void LaunchSliceForward(ep::Stream* stream, const SliceParams& entire_params, - const SliceParams& sliced_params, const T* entire, T* sliced) { - CHECK_EQ(entire_params.ndim, NDIM); - CHECK_EQ(sliced_params.ndim, NDIM); - int64_t elem_cnt = entire_params.elem_cnt(); - if (elem_cnt == 0) { return; } - SliceIndexHelper entire_splitted_large_idx_cvtr(entire_params.dims); - SliceIndexHelper sliced_splitted_large_idx_cvtr(entire_params.size); - SliceIndexHelper entire_full_small_idx_cvtr(sliced_params.dims); - SliceIndexHelper sliced_full_small_idx_cvtr(sliced_params.size); - SliceForwardGpu<<As()->cuda_stream()>>>( - elem_cnt, entire_params, sliced_params, entire_splitted_large_idx_cvtr, - sliced_splitted_large_idx_cvtr, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr, - entire, sliced); -} - -template -void LaunchSliceBackward(ep::Stream* stream, const SliceParams& params, const T* sliced, - T* entire) { - CHECK_EQ(params.ndim, NDIM); - int64_t elem_cnt = params.elem_cnt(); - SliceIndexHelper entire_idx_cvtr(params.dims); - SliceIndexHelper sliced_idx_cvtr(params.size); - if (elem_cnt == 0) { return; } - SliceBackwardGpu<<As()->cuda_stream()>>>( - elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, sliced); -} - -template -struct SliceSwitchUtil final { -#define MAKE_SLICE_SWITCH_ENTRY(func_name, N) func_name -#define DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(func_name) \ - DEFINE_STATIC_SWITCH_FUNC(void, func_name, MAKE_SLICE_SWITCH_ENTRY, MAKE_NDIM_CTRV_SEQ(DIM_SEQ)) - - 
DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceForward) - DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceBackward) -#undef DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD -#undef MAKE_SLICE_SWITCH_ENTRY -}; - -template -size_t GetPackSize(const SliceParams& params, const T* entire, const T* sliced) { - CHECK_GT(params.ndim, 0); - const int64_t last_dim = params.ndim - 1; - const int64_t mask = (params.dims[last_dim] * sizeof(T)) | (params.start[last_dim] * sizeof(T)) - | (params.size[last_dim] * sizeof(T)) - | static_cast(reinterpret_cast(entire)) - | static_cast(reinterpret_cast(sliced)); - if ((mask & 0xF) == 0) { - return 16; - } else if ((mask & 0x7) == 0) { - return 8; - } else if ((mask & 0x3) == 0) { - return 4; - } else if ((mask & 0x1) == 0) { - return 2; - } else { - return 1; - } -} - -template -void GetPackedParams(const SliceParams& params, const T* entire, const T* sliced, size_t* pack_size, - SliceParams* packed_params) { - CHECK_GT(params.ndim, 0); - const int64_t last_dim = params.ndim - 1; - if (params.step[last_dim] == 1) { - *pack_size = GetPackSize(params, entire, sliced); - CHECK_GE(*pack_size, sizeof(T)); - const int64_t elem_per_pack = *pack_size / sizeof(T); - *packed_params = params; - packed_params->dims[last_dim] /= elem_per_pack; - packed_params->start[last_dim] /= elem_per_pack; - packed_params->size[last_dim] /= elem_per_pack; - } else { - *pack_size = sizeof(T); - *packed_params = params; - } -} - -} // namespace - -template -struct SliceKernelUtil { - static void Forward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { - SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); - size_t pack_size; - SliceParams packed_params{}; - GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); - if (pack_size == 1) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 2) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 4) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 8) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 16) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else { - UNIMPLEMENTED(); - } - } - - static void Forward(ep::Stream* stream, const SliceParams& entire_params, - const SliceParams& sliced_params, const T* entire, T* sliced) { - SliceSwitchUtil::SwitchLaunchSliceForward(SwitchCase(entire_params.ndim), stream, - entire_params, sliced_params, entire, sliced); - } - - static void Backward(ep::Stream* stream, const SliceParams& params, const T* sliced, T* entire) { - SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); - size_t pack_size; - SliceParams packed_params{}; - GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); - if (pack_size == 1) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), 
reinterpret_cast(entire)); - } else if (pack_size == 2) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else if (pack_size == 4) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else if (pack_size == 8) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else if (pack_size == 16) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else { - UNIMPLEMENTED(); - } - } -}; - -INSTANTIATE_SLICE_KERNEL_UTIL_WITH_DEVICE(DeviceType::kCUDA) -INSTANTIATE_SLICE_KERNEL_UTIL(DeviceType::kCUDA, float16) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/slice_util.h" +#include "oneflow/core/common/switch_func.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void SliceForwardGpu(const int n, SliceParams params, + SliceIndexHelper entire_idx_cvtr, + SliceIndexHelper sliced_idx_cvtr, const T* entire, + T* sliced) { + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); + sliced[i] = entire[offset]; + } +} + +template +__global__ void SliceForwardGpu(const int n, SliceParams entire_params, SliceParams sliced_params, + SliceIndexHelper entire_splitted_large_idx_cvtr, + SliceIndexHelper sliced_splitted_large_idx_cvtr, + SliceIndexHelper entire_full_small_idx_cvtr, + SliceIndexHelper sliced_full_small_idx_cvtr, const T* entire, + T* sliced) { + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t entire_offset = SliceOffsetToEntireOffset( + i, entire_params, entire_splitted_large_idx_cvtr, sliced_splitted_large_idx_cvtr); + int64_t sliced_offset = SliceOffsetToEntireOffset( + i, sliced_params, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr); + sliced[sliced_offset] = entire[entire_offset]; + } +} + +template +__global__ void SliceBackwardGpu(const int n, SliceParams params, + SliceIndexHelper entire_idx_cvtr, + SliceIndexHelper sliced_idx_cvtr, T* entire, + const T* sliced) { + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); + entire[offset] = sliced[i]; + } +} + +template +void LaunchSliceForward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { + CHECK_EQ(params.ndim, NDIM); + int64_t elem_cnt = params.elem_cnt(); + SliceIndexHelper entire_idx_cvtr(params.dims); + SliceIndexHelper sliced_idx_cvtr(params.size); + if (elem_cnt == 0) { return; } + SliceForwardGpu<<As()->cuda_stream()>>>( + elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, 
sliced); +} + +template +void LaunchSliceForward(ep::Stream* stream, const SliceParams& entire_params, + const SliceParams& sliced_params, const T* entire, T* sliced) { + CHECK_EQ(entire_params.ndim, NDIM); + CHECK_EQ(sliced_params.ndim, NDIM); + int64_t elem_cnt = entire_params.elem_cnt(); + if (elem_cnt == 0) { return; } + SliceIndexHelper entire_splitted_large_idx_cvtr(entire_params.dims); + SliceIndexHelper sliced_splitted_large_idx_cvtr(entire_params.size); + SliceIndexHelper entire_full_small_idx_cvtr(sliced_params.dims); + SliceIndexHelper sliced_full_small_idx_cvtr(sliced_params.size); + SliceForwardGpu<<As()->cuda_stream()>>>( + elem_cnt, entire_params, sliced_params, entire_splitted_large_idx_cvtr, + sliced_splitted_large_idx_cvtr, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr, + entire, sliced); +} + +template +void LaunchSliceBackward(ep::Stream* stream, const SliceParams& params, const T* sliced, + T* entire) { + CHECK_EQ(params.ndim, NDIM); + int64_t elem_cnt = params.elem_cnt(); + SliceIndexHelper entire_idx_cvtr(params.dims); + SliceIndexHelper sliced_idx_cvtr(params.size); + if (elem_cnt == 0) { return; } + SliceBackwardGpu<<As()->cuda_stream()>>>( + elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, sliced); +} + +template +struct SliceSwitchUtil final { +#define MAKE_SLICE_SWITCH_ENTRY(func_name, N) func_name +#define DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(func_name) \ + DEFINE_STATIC_SWITCH_FUNC(void, func_name, MAKE_SLICE_SWITCH_ENTRY, MAKE_NDIM_CTRV_SEQ(DIM_SEQ)) + + DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceForward) + DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceBackward) +#undef DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD +#undef MAKE_SLICE_SWITCH_ENTRY +}; + +template +size_t GetPackSize(const SliceParams& params, const T* entire, const T* sliced) { + CHECK_GT(params.ndim, 0); + const int64_t last_dim = params.ndim - 1; + const int64_t mask = (params.dims[last_dim] * sizeof(T)) | (params.start[last_dim] * sizeof(T)) + | (params.size[last_dim] * sizeof(T)) + | static_cast(reinterpret_cast(entire)) + | static_cast(reinterpret_cast(sliced)); + if ((mask & 0xF) == 0) { + return 16; + } else if ((mask & 0x7) == 0) { + return 8; + } else if ((mask & 0x3) == 0) { + return 4; + } else if ((mask & 0x1) == 0) { + return 2; + } else { + return 1; + } +} + +template +void GetPackedParams(const SliceParams& params, const T* entire, const T* sliced, size_t* pack_size, + SliceParams* packed_params) { + CHECK_GT(params.ndim, 0); + const int64_t last_dim = params.ndim - 1; + if (params.step[last_dim] == 1) { + *pack_size = GetPackSize(params, entire, sliced); + CHECK_GE(*pack_size, sizeof(T)); + const int64_t elem_per_pack = *pack_size / sizeof(T); + *packed_params = params; + packed_params->dims[last_dim] /= elem_per_pack; + packed_params->start[last_dim] /= elem_per_pack; + packed_params->size[last_dim] /= elem_per_pack; + } else { + *pack_size = sizeof(T); + *packed_params = params; + } +} + +} // namespace + +template +struct SliceKernelUtil { + static void Forward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { + SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); + size_t pack_size; + SliceParams packed_params{}; + GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); + if (pack_size == 1) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 2) 
{ + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 4) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 8) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 16) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else { + UNIMPLEMENTED(); + } + } + + static void Forward(ep::Stream* stream, const SliceParams& entire_params, + const SliceParams& sliced_params, const T* entire, T* sliced) { + SliceSwitchUtil::SwitchLaunchSliceForward(SwitchCase(entire_params.ndim), stream, + entire_params, sliced_params, entire, sliced); + } + + static void Backward(ep::Stream* stream, const SliceParams& params, const T* sliced, T* entire) { + SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); + size_t pack_size; + SliceParams packed_params{}; + GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); + if (pack_size == 1) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 2) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 4) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 8) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 16) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else { + UNIMPLEMENTED(); + } + } +}; + +INSTANTIATE_SLICE_KERNEL_UTIL_WITH_DEVICE(DeviceType::kCUDA) +INSTANTIATE_SLICE_KERNEL_UTIL(DeviceType::kCUDA, float16) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp b/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp index 59d4eeb..ff8aca8 100644 --- a/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp +++ b/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp @@ -1,145 +1,145 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
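The Forward/Backward wrappers above choose a vectorized "pack" width purely from alignment: the byte extents of the last dimension and the two raw pointer addresses are OR-ed into one mask, and the widest power of two (up to 16 bytes) that divides the mask becomes the pack size. A host-side restatement of that test, with an illustrative name (PickPackSizeBytes is not part of the patch):

#include <cstddef>
#include <cstdint>

template<typename T>
size_t PickPackSizeBytes(int64_t last_dim, int64_t last_start, int64_t last_size,
                         const void* entire, const void* sliced) {
  // Any quantity that is not a multiple of `pack` bytes poisons the corresponding low bits.
  const uint64_t mask = static_cast<uint64_t>(last_dim * sizeof(T))
                        | static_cast<uint64_t>(last_start * sizeof(T))
                        | static_cast<uint64_t>(last_size * sizeof(T))
                        | static_cast<uint64_t>(reinterpret_cast<uintptr_t>(entire))
                        | static_cast<uint64_t>(reinterpret_cast<uintptr_t>(sliced));
  for (size_t pack = 16; pack > 1; pack /= 2) {
    if ((mask & (pack - 1)) == 0) { return pack; }
  }
  return 1;
}

GetPackedParams then divides the last dimension's dims/start/size by pack_size / sizeof(T), so each GPU thread copies one aligned 2-, 4-, 8- or 16-byte pack instead of a single element; packing is only attempted when the last-dimension step is 1, since a strided innermost dimension cannot be coalesced this way.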
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -using namespace loss; - -template -struct SmoothL1Functor { - float beta_; - float inv_beta_; - T half_of_one_; - SmoothL1Functor(float beta) - : beta_(beta), inv_beta_(static_cast(1.0 / beta)), half_of_one_(static_cast(0.5)) {} - - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - const T abs_diff = abs(input_val - target_val); - if (abs_diff < beta_) { - return half_of_one_ * abs_diff * abs_diff * inv_beta_; - } else { - return abs_diff - half_of_one_ * beta_; - } - } -}; - -template<> -struct SmoothL1Functor { - half beta_; - half inv_beta_; - half zero_; - half half_of_one_; - SmoothL1Functor(float beta) - : beta_(__float2half(beta)), - inv_beta_(__float2half(static_cast(1.0 / beta))), - zero_(__float2half(0.f)), - half_of_one_(__float2half(0.5f)) {} - - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - const half diff = input_val - target_val; - const half abs_diff = diff < zero_ ? __hneg(diff) : diff; - if (abs_diff < beta_) { - return half_of_one_ * abs_diff * abs_diff * inv_beta_; - } else { - return abs_diff - half_of_one_ * beta_; - } - } -}; - -template -struct SmoothL1GradFunctor { - float beta_; - float inv_beta_; - T zero_; - SmoothL1GradFunctor(float beta) - : beta_(beta), inv_beta_(static_cast(1.0 / beta)), zero_(GetZeroVal()) {} - - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { - const T diff = input_val - target_val; - const T abs_diff = abs(diff); - T dx_val; - if (abs_diff < beta_) { - dx_val = diff * inv_beta_; - } else { - dx_val = (diff > zero_) - (diff < zero_); - } - return dx_val * dy_val; - } -}; - -template<> -struct SmoothL1GradFunctor { - half beta_; - half inv_beta_; - half zero_; - half one_; - SmoothL1GradFunctor(float beta) - : beta_(__float2half(beta)), - inv_beta_(__float2half(static_cast(1.0 / beta))), - zero_(__float2half(0.f)), - one_(__float2half(1.f)) {} - - __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { - const half diff = input_val - target_val; - const half abs_diff = diff < zero_ ? 
__hneg(diff) : diff; - half dx_val; - if (abs_diff < beta_) { - dx_val = diff * inv_beta_; - } else { - dx_val = (diff > zero_) - (diff < zero_); - } - return dx_val * dy_val; - } -}; - -template -class SmoothL1LossKernel : public SimpleLossKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, T* out) const { - const float beta = ctx->Attr("beta"); - OF_CUDA_CHECK((cuda::elementwise::Binary(SmoothL1Functor(beta), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } -}; - -template -class SmoothL1LossGradKernel - : public SimpleLossGradKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, const T* dy, T* dx) const { - const float beta = ctx->Attr("beta"); - OF_CUDA_CHECK( - (cuda::elementwise::Ternary(SmoothL1GradFunctor(beta), elem_cnt, dx, input, target, dy, - ctx->stream()->As()->cuda_stream()))); - } -}; - -} // namespace - -REGISTER_SIMPLE_LOSS_KERNEL_CUDA("smooth_l1_loss", SmoothL1LossKernel) -REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("smooth_l1_loss_grad", SmoothL1LossGradKernel) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +using namespace loss; + +template +struct SmoothL1Functor { + float beta_; + float inv_beta_; + T half_of_one_; + SmoothL1Functor(float beta) + : beta_(beta), inv_beta_(static_cast(1.0 / beta)), half_of_one_(static_cast(0.5)) {} + + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + const T abs_diff = abs(input_val - target_val); + if (abs_diff < beta_) { + return half_of_one_ * abs_diff * abs_diff * inv_beta_; + } else { + return abs_diff - half_of_one_ * beta_; + } + } +}; + +template<> +struct SmoothL1Functor { + half beta_; + half inv_beta_; + half zero_; + half half_of_one_; + SmoothL1Functor(float beta) + : beta_(__float2half(beta)), + inv_beta_(__float2half(static_cast(1.0 / beta))), + zero_(__float2half(0.f)), + half_of_one_(__float2half(0.5f)) {} + + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + const half diff = input_val - target_val; + const half abs_diff = diff < zero_ ? 
__hneg(diff) : diff; + if (abs_diff < beta_) { + return half_of_one_ * abs_diff * abs_diff * inv_beta_; + } else { + return abs_diff - half_of_one_ * beta_; + } + } +}; + +template +struct SmoothL1GradFunctor { + float beta_; + float inv_beta_; + T zero_; + SmoothL1GradFunctor(float beta) + : beta_(beta), inv_beta_(static_cast(1.0 / beta)), zero_(GetZeroVal()) {} + + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { + const T diff = input_val - target_val; + const T abs_diff = abs(diff); + T dx_val; + if (abs_diff < beta_) { + dx_val = diff * inv_beta_; + } else { + dx_val = (diff > zero_) - (diff < zero_); + } + return dx_val * dy_val; + } +}; + +template<> +struct SmoothL1GradFunctor { + half beta_; + half inv_beta_; + half zero_; + half one_; + SmoothL1GradFunctor(float beta) + : beta_(__float2half(beta)), + inv_beta_(__float2half(static_cast(1.0 / beta))), + zero_(__float2half(0.f)), + one_(__float2half(1.f)) {} + + __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { + const half diff = input_val - target_val; + const half abs_diff = diff < zero_ ? __hneg(diff) : diff; + half dx_val; + if (abs_diff < beta_) { + dx_val = diff * inv_beta_; + } else { + dx_val = (diff > zero_) - (diff < zero_); + } + return dx_val * dy_val; + } +}; + +template +class SmoothL1LossKernel : public SimpleLossKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, T* out) const { + const float beta = ctx->Attr("beta"); + OF_CUDA_CHECK((cuda::elementwise::Binary(SmoothL1Functor(beta), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } +}; + +template +class SmoothL1LossGradKernel + : public SimpleLossGradKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, const T* dy, T* dx) const { + const float beta = ctx->Attr("beta"); + OF_CUDA_CHECK( + (cuda::elementwise::Ternary(SmoothL1GradFunctor(beta), elem_cnt, dx, input, target, dy, + ctx->stream()->As()->cuda_stream()))); + } +}; + +} // namespace + +REGISTER_SIMPLE_LOSS_KERNEL_CUDA("smooth_l1_loss", SmoothL1LossKernel) +REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("smooth_l1_loss_grad", SmoothL1LossGradKernel) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp index 40e5cb1..c15d355 100644 --- a/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp @@ -1,156 +1,156 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
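For reference, the functors above implement the usual piecewise Smooth L1 definition: a scaled quadratic inside the |diff| < beta window and a shifted absolute value outside it, with the gradient falling back to sign(diff) in the linear region. A scalar host version that mirrors the same branches (beta > 0 assumed):

#include <cmath>

inline float SmoothL1(float input, float target, float beta) {
  const float abs_diff = std::fabs(input - target);
  return abs_diff < beta ? 0.5f * abs_diff * abs_diff / beta : abs_diff - 0.5f * beta;
}

inline float SmoothL1Grad(float input, float target, float dy, float beta) {
  const float diff = input - target;
  const float abs_diff = std::fabs(diff);
  const float ddiff = abs_diff < beta
                          ? diff / beta                                       // quadratic region
                          : static_cast<float>((diff > 0.f) - (diff < 0.f));  // sign(diff)
  return ddiff * dy;
}

The half specialization keeps the same structure but precomputes its constants with __float2half, presumably to avoid repeated float/half conversions inside the device loop.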
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/softmax_cross_entropy_kernel.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { -namespace user_op { - -namespace { - -constexpr int64_t kCrossEntropyGpuBlockSize = 128; - -template -__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, - const T* x, const T* labels, T* y) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - const int tid = threadIdx.x; - for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { - const int row_offset = row * num_classes; - const T* in_row = x + row_offset; - const T* label_row = labels + row_offset; - T result = 0; - for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { - T label = label_row[col]; - T prob = in_row[col]; - result += -label * SafeLog(prob); - } - __syncthreads(); - T row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); - if (0 == tid) { y[row] = row_reduce_result; } - } -} - -__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, - const half* x, const half* labels, half* y) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - const int tid = threadIdx.x; - for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { - const int row_offset = row * num_classes; - const half* in_row = x + row_offset; - const half* label_row = labels + row_offset; - float result = 0; - for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { - float label = __half2float(label_row[col]); - float prob = __half2float(in_row[col]); - result += -label * SafeLog(prob); - } - __syncthreads(); - float row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); - if (0 == tid) { y[row] = __float2half(row_reduce_result); } - } -} - -template -__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, - const T* prob, const T* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row_id = i / num_classes; - dx[i] = dy[row_id] * (prob[i] - labels[i]); - } -} - -__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, - const half* prob, const half* labels, const half* dy, - half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row_id = i / num_classes; - dx[i] = __hmul(dy[row_id], __hsub(prob[i], labels[i])); - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -} // namespace - -int GetCrossEntropyNumBlocks(const int num_instances) { - return std::min(static_cast(num_instances), kCudaMaxBlocksNum); -} - -int GetCrossEntropyBlockSize() { return kCrossEntropyGpuBlockSize; } - -template -struct CrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const T* x, const T* labels, T* y) { - OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(T) * num_instances, - stream->As()->cuda_stream())); - ComputeEntropyGpu<<As()->cuda_stream()>>>(num_instances, num_classes, - x, labels, y); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const T* 
prob, const T* labels, - const T* dy, T* dx) { - ComputeDiffWithSoftmaxGpu<<As()->cuda_stream()>>>( - elem_cnt, num_classes, prob, labels, dy, dx); - } -}; - -template<> -struct CrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const float16* x, const float16* labels, - float16* y) { - OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(float16) * num_instances, - stream->As()->cuda_stream())); - ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, reinterpret_cast(x), - reinterpret_cast(labels), reinterpret_cast(y)); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const float16* prob, - const float16* labels, const float16* dy, float16* dx) { - ComputeDiffWithSoftmaxGpuHalf<<As()->cuda_stream()>>>( - elem_cnt, num_classes, reinterpret_cast(prob), - reinterpret_cast(labels), reinterpret_cast(dy), - reinterpret_cast(dx)); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_KERNEL, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_GRAD_KERNEL, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/softmax_cross_entropy_kernel.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { +namespace user_op { + +namespace { + +constexpr int64_t kCrossEntropyGpuBlockSize = 128; + +template +__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, + const T* x, const T* labels, T* y) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + const int tid = threadIdx.x; + for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { + const int row_offset = row * num_classes; + const T* in_row = x + row_offset; + const T* label_row = labels + row_offset; + T result = 0; + for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { + T label = label_row[col]; + T prob = in_row[col]; + result += -label * SafeLog(prob); + } + __syncthreads(); + T row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); + if (0 == tid) { y[row] = row_reduce_result; } + } +} + +__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, + const half* x, const half* labels, half* y) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + const int tid = threadIdx.x; + for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { + const int row_offset = row * num_classes; + const half* in_row = x + row_offset; + const half* label_row = labels + row_offset; + float result = 0; + for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { + float label = __half2float(label_row[col]); + float prob = __half2float(in_row[col]); + result += -label * SafeLog(prob); + } + __syncthreads(); + float row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); + if (0 == tid) { y[row] = __float2half(row_reduce_result); } + } +} + +template +__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, + const T* prob, const T* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row_id = i / num_classes; + dx[i] = dy[row_id] * (prob[i] - labels[i]); + } +} + +__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, + const half* prob, const half* labels, const half* dy, + half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row_id = i / num_classes; + dx[i] = __hmul(dy[row_id], __hsub(prob[i], labels[i])); + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +} // namespace + +int GetCrossEntropyNumBlocks(const int num_instances) { + return std::min(static_cast(num_instances), kCudaMaxBlocksNum); +} + +int GetCrossEntropyBlockSize() { return kCrossEntropyGpuBlockSize; } + +template +struct CrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const T* x, const T* labels, T* y) { + OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(T) * num_instances, + stream->As()->cuda_stream())); + ComputeEntropyGpu<<As()->cuda_stream()>>>(num_instances, num_classes, + x, labels, y); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const T* 
prob, const T* labels, + const T* dy, T* dx) { + ComputeDiffWithSoftmaxGpu<<As()->cuda_stream()>>>( + elem_cnt, num_classes, prob, labels, dy, dx); + } +}; + +template<> +struct CrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const float16* x, const float16* labels, + float16* y) { + OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(float16) * num_instances, + stream->As()->cuda_stream())); + ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, reinterpret_cast(x), + reinterpret_cast(labels), reinterpret_cast(y)); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const float16* prob, + const float16* labels, const float16* dy, float16* dx) { + ComputeDiffWithSoftmaxGpuHalf<<As()->cuda_stream()>>>( + elem_cnt, num_classes, reinterpret_cast(prob), + reinterpret_cast(labels), reinterpret_cast(dy), + reinterpret_cast(dx)); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_KERNEL, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_GRAD_KERNEL, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sort_kernel.hip.cpp b/oneflow/user/kernels/sort_kernel.hip.cpp index 1f6067f..186dfb9 100644 --- a/oneflow/user/kernels/sort_kernel.hip.cpp +++ b/oneflow/user/kernels/sort_kernel.hip.cpp @@ -1,81 +1,81 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
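Written as a plain host loop, the kernels above compute, per row, the entropy y[row] = -sum over classes of labels * log(prob) (the hipcub block reduction replaces the inner sum on the GPU), and the usual softmax-cross-entropy shortcut for the backward pass, dx = dy * (prob - labels). A reference sketch; the eps clamp stands in for SafeLog, and its exact threshold is an assumption:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline void CrossEntropyRowwiseRef(int64_t num_instances, int64_t num_classes, const float* prob,
                                   const float* labels, const float* dy, float* y, float* dx) {
  constexpr float kEps = 1e-20f;  // stand-in for the SafeLog threshold
  for (int64_t row = 0; row < num_instances; ++row) {
    float loss = 0.f;
    for (int64_t c = 0; c < num_classes; ++c) {
      const int64_t i = row * num_classes + c;
      loss += -labels[i] * std::log(std::max(prob[i], kEps));
      dx[i] = dy[row] * (prob[i] - labels[i]);
    }
    y[row] = loss;
  }
}

Note that the half path accumulates each row sum in float before converting back, which is why ComputeEntropyGpuHalf reduces float rather than half.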
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -template -class GpuSortKernel final : public user_op::OpKernel { - public: - GpuSortKernel() = default; - ~GpuSortKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in->shape_view().elem_cnt() * sizeof(T)); - const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int32_t instance_num = in->shape_view().elem_cnt() / instance_size; - const std::string& direction = ctx->Attr("direction"); - if (direction == "ASCENDING") { - SortKeysAscending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else if (direction == "DESCENDING") { - SortKeysDescending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_SORT_KERNEL(dtype) \ - REGISTER_USER_KERNEL("sort") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int32_t instance_size = in_shape.dim_vec().back(); \ - const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ - const std::string& direction = ctx->Attr("direction"); \ - if (direction == "ASCENDING") { \ - return InferTempStorageForSortKeysAscending(instance_num, instance_size); \ - } else if (direction == "DESCENDING") { \ - return InferTempStorageForSortKeysDescending(instance_num, instance_size); \ - } else { \ - UNIMPLEMENTED(); \ - return 0; \ - } \ - }); - -REGISTER_CUDA_SORT_KERNEL(float) -REGISTER_CUDA_SORT_KERNEL(double) -REGISTER_CUDA_SORT_KERNEL(int32_t) -REGISTER_CUDA_SORT_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +template +class GpuSortKernel final : public user_op::OpKernel { + public: + GpuSortKernel() = default; + ~GpuSortKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), + in->shape_view().elem_cnt() * sizeof(T)); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = in->shape_view().elem_cnt() / instance_size; + const std::string& direction = ctx->Attr("direction"); + if (direction == "ASCENDING") { + SortKeysAscending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else if (direction == "DESCENDING") { + SortKeysDescending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_SORT_KERNEL(dtype) \ + REGISTER_USER_KERNEL("sort") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int32_t instance_size = in_shape.dim_vec().back(); \ + const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ + const std::string& direction = ctx->Attr("direction"); \ + if (direction == "ASCENDING") { \ + return InferTempStorageForSortKeysAscending(instance_num, instance_size); \ + } else if (direction == "DESCENDING") { \ + return InferTempStorageForSortKeysDescending(instance_num, instance_size); \ + } else { \ + UNIMPLEMENTED(); \ + return 0; \ + } \ + }); + +REGISTER_CUDA_SORT_KERNEL(float) +REGISTER_CUDA_SORT_KERNEL(double) +REGISTER_CUDA_SORT_KERNEL(int32_t) +REGISTER_CUDA_SORT_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp b/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp index 38c3903..d417fb8 100644 --- a/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp +++ b/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp @@ -1,267 +1,267 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
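The sort kernel above first copies the input into out, then sorts each row of instance_size keys independently, using the radix-sort helpers and the tmp_buffer whose size InferTmpSizeFn reserves ahead of time (the device path is presumably a segmented radix sort, which is why the scratch size must be derived from instance_num and instance_size before launch). A host reference of the result it produces for direction == "ASCENDING":

#include <algorithm>
#include <cstdint>
#include <functional>

template<typename T>
void SortEachInstanceAscendingRef(const T* in, int32_t instance_num, int32_t instance_size,
                                  T* out) {
  const int64_t total = static_cast<int64_t>(instance_num) * instance_size;
  std::copy(in, in + total, out);
  for (int32_t i = 0; i < instance_num; ++i) {
    T* row = out + static_cast<int64_t>(i) * instance_size;
    std::sort(row, row + instance_size);  // "DESCENDING" would use std::greater<T>() here
  }
}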
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -template -__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const T* x, - const K* labels, T* y) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { y[i] = -SafeLog(x[i * num_classes + label]); } - } -} - -template -__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const half* x, - const K* labels, half* y) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { - y[i] = __float2half(-SafeLog(__half2float(x[i * num_classes + label]))); - } - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const T* x, - const K* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { - dx[i * num_classes + label] = -dy[i] / MaxWithLogThreshold(x[i * num_classes + label]); - } - } -} - -template -__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const half* x, - const K* labels, const half* dy, half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { - dx[i * num_classes + label] = - __hneg(__hdiv(__float2half(dy[i]), MaxWithLogThreshold(x[i * num_classes + label]))); - } - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, - const T* prob, const K* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = dy[row_id] * (prob[i] - 1); - } else { - dx[i] = dy[row_id] * prob[i]; - } - } -} - -template -__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, - const half* prob, const K* labels, const half* dy, - half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || 
defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { - // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = __hmul(dy[row_id], __hsub(prob[i], __float2half(1.0))); - } else { - dx[i] = __hmul(dy[row_id], prob[i]); - } - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -__global__ void ComputeDiffWithSoftmaxGpuHalf2(const int64_t elem_cnt, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, - const half* prob, const K* labels, const half* dy, - half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const int64_t h2_num_classes = num_classes / 2; - const int64_t h2_elem_cnt = elem_cnt / 2; - const auto* prob_h2 = reinterpret_cast(prob); - auto* dx_h2 = reinterpret_cast(dx); - CUDA_1D_KERNEL_LOOP_T(IndexType, i, h2_elem_cnt) { - const IndexType row_id = i / h2_num_classes; - const IndexType h2_col_id = i - row_id * h2_num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - const half2 prob_h2_i = prob_h2[i]; - const half dy_row = dy[row_id]; - half2 dx_h2_i; - dx_h2_i.data.x = __hmul(dy_row, __hsub(prob_h2_i.data.x, static_cast(label == 2 * h2_col_id))); - dx_h2_i.data.y = __hmul(dy_row, __hsub(prob_h2_i.data.y, static_cast(label == 2 * h2_col_id + 1))); - dx_h2[i] = dx_h2_i; - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -} // namespace - -template -struct SparseCrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const T* x, const K* labels, T* y) { - ComputeEntropyGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, x, labels, y); - } - - static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const T* x, const K* labels, const T* dy, T* dx) { - ComputeDiffGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, x, labels, dy, dx); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const T* prob, const K* labels, - const T* dy, T* dx) { - if (elem_cnt < GetMaxVal() / 2) { - ComputeDiffWithSoftmaxGpu - <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, - lower_bound, prob, labels, dy, dx); - } else { - ComputeDiffWithSoftmaxGpu - <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, - lower_bound, prob, labels, dy, dx); - } - } -}; - -template -struct SparseCrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const float16* x, const K* labels, - float16* y) { - ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, - reinterpret_cast(y)); - } - - static void ComputeDiff(ep::Stream* stream, const 
int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const float16* x, const K* labels, const float16* dy, float16* dx) { - ComputeDiffGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, - reinterpret_cast(dy), reinterpret_cast(dx)); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const float16* prob, - const K* labels, const float16* dy, float16* dx) { - if (num_classes % 2 == 0) { - if (elem_cnt < GetMaxVal() / 2) { - ComputeDiffWithSoftmaxGpuHalf2 - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } else { - ComputeDiffWithSoftmaxGpuHalf2 - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } - } else { - if (elem_cnt < GetMaxVal() / 2) { - ComputeDiffWithSoftmaxGpuHalf - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } else { - ComputeDiffWithSoftmaxGpuHalf - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } - } - } -}; - -#define INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ - template struct SparseCrossEntropyKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +template +__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const T* x, + const K* labels, T* y) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { y[i] = -SafeLog(x[i * num_classes + label]); } + } +} + +template +__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const half* x, + const K* labels, half* y) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { + y[i] = __float2half(-SafeLog(__half2float(x[i * num_classes + label]))); + } + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +template +__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const T* x, + const K* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { + dx[i * num_classes + label] = -dy[i] / MaxWithLogThreshold(x[i * num_classes + label]); + } + } +} + +template +__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const half* x, + const K* labels, const half* dy, half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { + dx[i * num_classes + label] = + __hneg(__hdiv(__float2half(dy[i]), MaxWithLogThreshold(x[i * num_classes + label]))); + } + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +template +__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, + const T* prob, const K* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = dy[row_id] * (prob[i] - 1); + } else { + dx[i] = dy[row_id] * prob[i]; + } + } +} + +template +__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, + const half* prob, const K* labels, const half* dy, + half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || 
defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { + // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = __hmul(dy[row_id], __hsub(prob[i], __float2half(1.0))); + } else { + dx[i] = __hmul(dy[row_id], prob[i]); + } + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +template +__global__ void ComputeDiffWithSoftmaxGpuHalf2(const int64_t elem_cnt, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, + const half* prob, const K* labels, const half* dy, + half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const int64_t h2_num_classes = num_classes / 2; + const int64_t h2_elem_cnt = elem_cnt / 2; + const auto* prob_h2 = reinterpret_cast(prob); + auto* dx_h2 = reinterpret_cast(dx); + CUDA_1D_KERNEL_LOOP_T(IndexType, i, h2_elem_cnt) { + const IndexType row_id = i / h2_num_classes; + const IndexType h2_col_id = i - row_id * h2_num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + const half2 prob_h2_i = prob_h2[i]; + const half dy_row = dy[row_id]; + half2 dx_h2_i; + dx_h2_i.data.x = __hmul(dy_row, __hsub(prob_h2_i.data.x, static_cast(label == 2 * h2_col_id))); + dx_h2_i.data.y = __hmul(dy_row, __hsub(prob_h2_i.data.y, static_cast(label == 2 * h2_col_id + 1))); + dx_h2[i] = dx_h2_i; + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +} // namespace + +template +struct SparseCrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const T* x, const K* labels, T* y) { + ComputeEntropyGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, x, labels, y); + } + + static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const T* x, const K* labels, const T* dy, T* dx) { + ComputeDiffGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, x, labels, dy, dx); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const T* prob, const K* labels, + const T* dy, T* dx) { + if (elem_cnt < GetMaxVal() / 2) { + ComputeDiffWithSoftmaxGpu + <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, + lower_bound, prob, labels, dy, dx); + } else { + ComputeDiffWithSoftmaxGpu + <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, + lower_bound, prob, labels, dy, dx); + } + } +}; + +template +struct SparseCrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const float16* x, const K* labels, + float16* y) { + ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, + reinterpret_cast(y)); + } + + static void ComputeDiff(ep::Stream* stream, const 
int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const float16* x, const K* labels, const float16* dy, float16* dx) { + ComputeDiffGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, + reinterpret_cast(dy), reinterpret_cast(dx)); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const float16* prob, + const K* labels, const float16* dy, float16* dx) { + if (num_classes % 2 == 0) { + if (elem_cnt < GetMaxVal() / 2) { + ComputeDiffWithSoftmaxGpuHalf2 + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } else { + ComputeDiffWithSoftmaxGpuHalf2 + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } + } else { + if (elem_cnt < GetMaxVal() / 2) { + ComputeDiffWithSoftmaxGpuHalf + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } else { + ComputeDiffWithSoftmaxGpuHalf + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } + } + } +}; + +#define INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ + template struct SparseCrossEntropyKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp index 869b283..71d7845 100644 --- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp @@ -1,131 +1,131 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
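// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how the half2-vectorized
// ComputeDiffWithSoftmaxGpuHalf2 path above is typically structured, with the
// template arguments and <<<...>>> launch configuration written out in full.
// The 256-thread block size, the grid cap, and the helper names here are
// assumptions, not values taken from this repository.
// ---------------------------------------------------------------------------
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <algorithm>
#include <cstdint>

namespace sketch {

// Each thread handles one half2, i.e. two adjacent classes of one row:
// dx = dy * (prob - onehot(label)).
template<typename K, typename IndexType>
__global__ void DiffWithSoftmaxHalf2(IndexType h2_elem_cnt, IndexType h2_num_classes,
                                     int64_t lower_bound, const half2* prob, const K* labels,
                                     const half* dy, half2* dx) {
  for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < h2_elem_cnt;
       i += static_cast<IndexType>(blockDim.x) * gridDim.x) {
    const IndexType row = i / h2_num_classes;
    const IndexType h2_col = i - row * h2_num_classes;
    const int64_t label = static_cast<int64_t>(labels[row]) - lower_bound;
    const half p_lo = __low2half(prob[i]);
    const half p_hi = __high2half(prob[i]);
    const half d = dy[row];
    const half one = __float2half(1.0f);
    const half zero = __float2half(0.0f);
    const half g_lo = __hmul(d, __hsub(p_lo, label == 2 * h2_col ? one : zero));
    const half g_hi = __hmul(d, __hsub(p_hi, label == 2 * h2_col + 1 ? one : zero));
    dx[i] = __halves2half2(g_lo, g_hi);
  }
}

// Host-side dispatch: the half2 kernel is only usable when num_classes is even,
// so each thread's pair of classes never straddles a row boundary.
template<typename K>
void LaunchDiffWithSoftmaxHalf2(hipStream_t stream, int64_t elem_cnt, int64_t num_classes,
                                int64_t lower_bound, const half* prob, const K* labels,
                                const half* dy, half* dx) {
  const int64_t h2_elem_cnt = elem_cnt / 2;
  if (h2_elem_cnt == 0) { return; }
  constexpr int kBlock = 256;  // assumed block size
  const int grid =
      static_cast<int>(std::min<int64_t>((h2_elem_cnt + kBlock - 1) / kBlock, 4096));
  hipLaunchKernelGGL((DiffWithSoftmaxHalf2<K, int64_t>), dim3(grid), dim3(kBlock), 0, stream,
                     h2_elem_cnt, num_classes / 2, lower_bound,
                     reinterpret_cast<const half2*>(prob), labels, dy,
                     reinterpret_cast<half2*>(dx));
}

}  // namespace sketch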
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -template -void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const T* in, T* prob) { - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load(in, col); - cuda::softmax::DirectStore store(prob, col); - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( - stream->As()->cuda_stream(), load, store, row, col))); -} - -template<> -void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const float16* in, - float16* prob) { - cuda::softmax::DirectLoad load(reinterpret_cast(in), col); - cuda::softmax::DirectStore store(reinterpret_cast(prob), col); - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( - stream->As()->cuda_stream(), load, store, row, col))); -} - -template -__global__ void ComputeSparseSoftmaxCrossEntropyResultGpu(const int64_t num_instances, - const int64_t num_classes, - const int64_t depth, - const int64_t lower_bound, - const K* labels, const T* prob, T* out) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { out[i] = -prob[i * num_classes + label]; } - } -} -template -inline typename std::enable_if::value, void>::type -ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const K* labels, const T* prob, - T* out) { - ComputeSparseSoftmaxCrossEntropyResultGpu - <<As()->cuda_stream()>>>(num_instances, num_classes, depth, - lower_bound, labels, prob, out); -} -template -inline typename std::enable_if::value, void>::type -ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const K* labels, const T* prob, - T* out) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - ComputeSparseSoftmaxCrossEntropyResultGpu - <<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, labels, - reinterpret_cast(prob), reinterpret_cast(out)); -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} -} // namespace - -template -class SparseSoftmaxCrossEntropyKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - SparseSoftmaxCrossEntropyKernel() = default; - ~SparseSoftmaxCrossEntropyKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const int64_t num_instances = label->shape_view().elem_cnt(); - CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; - const int64_t 
lower_bound = 0; - const int64_t depth = ctx->Attr("depth"); - - ComputeProb(ctx->stream(), num_instances, num_classes, prediction->dptr(), - prob->mut_dptr()); - ComputeSparseSoftmaxCrossEntropyResult(ctx->stream(), num_instances, num_classes, depth, - lower_bound, label->dptr(), prob->dptr(), - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("sparse_softmax_cross_entropy") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +template +void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const T* in, T* prob) { + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load(in, col); + cuda::softmax::DirectStore store(prob, col); + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( + stream->As()->cuda_stream(), load, store, row, col))); +} + +template<> +void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const float16* in, + float16* prob) { + cuda::softmax::DirectLoad load(reinterpret_cast(in), col); + cuda::softmax::DirectStore store(reinterpret_cast(prob), col); + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( + stream->As()->cuda_stream(), load, store, row, col))); +} + +template +__global__ void ComputeSparseSoftmaxCrossEntropyResultGpu(const int64_t num_instances, + const int64_t num_classes, + const int64_t depth, + const int64_t lower_bound, + const K* labels, const T* prob, T* out) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { out[i] = -prob[i * num_classes + label]; } + } +} +template +inline typename std::enable_if::value, void>::type +ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const K* labels, const T* prob, + T* out) { + ComputeSparseSoftmaxCrossEntropyResultGpu + <<As()->cuda_stream()>>>(num_instances, num_classes, depth, + lower_bound, labels, prob, out); +} +template +inline 
typename std::enable_if::value, void>::type +ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const K* labels, const T* prob, + T* out) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + ComputeSparseSoftmaxCrossEntropyResultGpu + <<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, labels, + reinterpret_cast(prob), reinterpret_cast(out)); +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} +} // namespace + +template +class SparseSoftmaxCrossEntropyKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + SparseSoftmaxCrossEntropyKernel() = default; + ~SparseSoftmaxCrossEntropyKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; + const int64_t lower_bound = 0; + const int64_t depth = ctx->Attr("depth"); + + ComputeProb(ctx->stream(), num_instances, num_classes, prediction->dptr(), + prob->mut_dptr()); + ComputeSparseSoftmaxCrossEntropyResult(ctx->stream(), num_instances, num_classes, depth, + lower_bound, label->dptr(), prob->dptr(), + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL(dtype_pair, ltype_pair) \ + REGISTER_USER_KERNEL("sparse_softmax_cross_entropy") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ + && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp index b2a004a..e637751 100644 --- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp +++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp @@ -1,134 +1,134 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
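// ---------------------------------------------------------------------------
// Minimal host-side reference (illustrative, not part of the patch) for the
// gradient produced by the ComputeDiffGpu kernels in the
// sparse_softmax_cross_entropy_kernel_util diff that follows. `log_prob` holds
// log-softmax values, matching the forward pass above that dispatches
// DispatchLogSoftmax, so the softmax is recovered with exp():
//   dx[r][c] = dy[r] * (exp(log_prob[r][c]) - (c == labels[r] - lower_bound))
// ---------------------------------------------------------------------------
#include <cmath>
#include <cstdint>
#include <vector>

namespace sketch {

template<typename T, typename K>
void SparseSoftmaxXentBackwardRef(int64_t rows, int64_t cols, int64_t lower_bound,
                                  const std::vector<T>& log_prob, const std::vector<K>& labels,
                                  const std::vector<T>& dy, std::vector<T>* dx) {
  dx->assign(rows * cols, T(0));
  for (int64_t r = 0; r < rows; ++r) {
    const int64_t label = static_cast<int64_t>(labels[r]) - lower_bound;
    for (int64_t c = 0; c < cols; ++c) {
      const T softmax = std::exp(log_prob[r * cols + c]);
      const T onehot = (label == c) ? T(1) : T(0);
      (*dx)[r * cols + c] = dy[r] * (softmax - onehot);
    }
  }
}

}  // namespace sketch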
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -template -__inline__ __device__ T Exp(T x); - -template<> -__inline__ __device__ float Exp(float x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __expf(x); -#else - return exp(x); -#endif -} - -template<> -__inline__ __device__ double Exp(double x) { - return exp(x); -} - -template<> -__inline__ __device__ half Exp(half x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __float2half(__expf(__half2float(x))); -#else - return __float2half(exp(__half2float(x))); -#endif -} - -template -__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const T* prob, - const K* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = dy[row_id] * (Exp(prob[i]) - 1); - } else { - dx[i] = dy[row_id] * Exp(prob[i]); - } - } -} - -template -__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const half* prob, - const K* labels, const half* dy, half* dx) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = __hmul(dy[row_id], __hsub(Exp(prob[i]), __float2half(1.0))); - } else { - dx[i] = __hmul(dy[row_id], Exp(prob[i])); - } - } -} - -} // namespace - -template -struct SparseSoftmaxCrossEntropyKernelUtil { - static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const T* prob, const K* labels, const T* dy, T* dx) { - if (num_instances < GetMaxVal() / 2) { - ComputeDiffGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); - } else { - // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. 
- ComputeDiffGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); - } - } -}; - -template -struct SparseSoftmaxCrossEntropyKernelUtil { - static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const float16* prob, const K* labels, const float16* dy, float16* dx) { - if (num_instances < GetMaxVal() / 2) { - ComputeDiffGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } else { - ComputeDiffGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } - } -}; - -#define INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ - template struct SparseSoftmaxCrossEntropyKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
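// ---------------------------------------------------------------------------
// Illustrative sketch of the index-type dispatch used by the launchers in this
// file (see the NOTE about int64_t division above): 32-bit indexing is selected
// whenever the element count fits with headroom, because 64-bit '/' and '%' in
// the kernel loop are significantly slower on GPU. The helper below is an
// assumed minimal form, not an API of this repository.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <limits>
#include <utility>

namespace sketch {

template<typename Fn32, typename Fn64>
void DispatchIndexType(int64_t elem_cnt, Fn32&& launch_with_int32, Fn64&& launch_with_int64) {
  // The "/ 2" keeps headroom so intermediate index arithmetic cannot overflow int32_t.
  if (elem_cnt < std::numeric_limits<int32_t>::max() / 2) {
    std::forward<Fn32>(launch_with_int32)();
  } else {
    std::forward<Fn64>(launch_with_int64)();
  }
}

}  // namespace sketch

// Usage (schematic): sketch::DispatchIndexType(n,
//     [&] { /* launch Kernel<..., int32_t> */ },
//     [&] { /* launch Kernel<..., int64_t> */ });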
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +template +__inline__ __device__ T Exp(T x); + +template<> +__inline__ __device__ float Exp(float x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __expf(x); +#else + return exp(x); +#endif +} + +template<> +__inline__ __device__ double Exp(double x) { + return exp(x); +} + +template<> +__inline__ __device__ half Exp(half x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __float2half(__expf(__half2float(x))); +#else + return __float2half(exp(__half2float(x))); +#endif +} + +template +__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const T* prob, + const K* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = dy[row_id] * (Exp(prob[i]) - 1); + } else { + dx[i] = dy[row_id] * Exp(prob[i]); + } + } +} + +template +__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const half* prob, + const K* labels, const half* dy, half* dx) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = __hmul(dy[row_id], __hsub(Exp(prob[i]), __float2half(1.0))); + } else { + dx[i] = __hmul(dy[row_id], Exp(prob[i])); + } + } +} + +} // namespace + +template +struct SparseSoftmaxCrossEntropyKernelUtil { + static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const T* prob, const K* labels, const T* dy, T* dx) { + if (num_instances < GetMaxVal() / 2) { + ComputeDiffGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); + } else { + // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. 
+ ComputeDiffGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); + } + } +}; + +template +struct SparseSoftmaxCrossEntropyKernelUtil { + static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const float16* prob, const K* labels, const float16* dy, float16* dx) { + if (num_instances < GetMaxVal() / 2) { + ComputeDiffGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } else { + ComputeDiffGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } + } +}; + +#define INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ + template struct SparseSoftmaxCrossEntropyKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp b/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp index 7db4c19..fbe4d2a 100644 --- a/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp +++ b/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp @@ -1,83 +1,83 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
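// ---------------------------------------------------------------------------
// Illustrative sketch (assumed helper names and block size, not the patch's
// exact code) of the two-pass reduction used by sqrt_square_sum below: with a
// single block, one hipcub::BlockReduce produces sqrt(sum(x*x)) directly; with
// many blocks, each block writes its partial sum into a temporary buffer and a
// second single-block pass sums the partials and applies sqrt.
// ---------------------------------------------------------------------------
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>

namespace sketch {

constexpr int kBlockThreads = 256;  // assumed block size

template<typename T>
__global__ void PartialSquareSum(int64_t n, const T* x, T* tmp) {
  T thread_sum = 0;
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<int64_t>(blockDim.x) * gridDim.x) {
    thread_sum += x[i] * x[i];
  }
  using BlockReduce = hipcub::BlockReduce<T, kBlockThreads>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const T block_sum = BlockReduce(temp_storage).Sum(thread_sum);
  if (threadIdx.x == 0) { tmp[blockIdx.x] = block_sum; }
}

// Launched with exactly one block over the per-block partial sums.
template<typename T>
__global__ void SqrtOfSum(int64_t n, const T* partial, T* y) {
  T thread_sum = 0;
  for (int64_t i = threadIdx.x; i < n; i += blockDim.x) { thread_sum += partial[i]; }
  using BlockReduce = hipcub::BlockReduce<T, kBlockThreads>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const T block_sum = BlockReduce(temp_storage).Sum(thread_sum);
  if (threadIdx.x == 0) { *y = sqrt(block_sum); }
}

}  // namespace sketch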
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sqrt_square_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__global__ void SqrtSquareSumForOneThreadBlock(int64_t n, const T* x, T* y) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { *y = sqrt(b_sum); } -} - -template -__global__ void SqrtSumForMultiThreadBlock(int64_t n, const T* x, T* y) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { *y = sqrt(b_sum); } -} - -template -__global__ void SquareSumForMultiThreadBlock(int64_t n, const T* x, T* tmp) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { tmp[blockIdx.x] = b_sum; } -} - -} // namespace - -template -struct SqrtSquareSumKernelUtil { - static void SqrtSquareSum(ep::Stream* stream, int64_t n, const T* x, T* y, T* tmp) { - const int32_t num_blocks = BlocksNum4ThreadsNum(n); - CHECK_GE(num_blocks, 0); - if (num_blocks == 1) { - SqrtSquareSumForOneThreadBlock - <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); - } else { - Memset(stream, y, 0, sizeof(T)); - SquareSumForMultiThreadBlock - <<As()->cuda_stream()>>>( - n, x, tmp); - SqrtSumForMultiThreadBlock - <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>( - num_blocks, tmp, y); - } - } -}; - -#define INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct SqrtSquareSumKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); -#undef INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sqrt_square_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__global__ void SqrtSquareSumForOneThreadBlock(int64_t n, const T* x, T* y) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { *y = sqrt(b_sum); } +} + +template +__global__ void SqrtSumForMultiThreadBlock(int64_t n, const T* x, T* y) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { *y = sqrt(b_sum); } +} + +template +__global__ void SquareSumForMultiThreadBlock(int64_t n, const T* x, T* tmp) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { tmp[blockIdx.x] = b_sum; } +} + +} // namespace + +template +struct SqrtSquareSumKernelUtil { + static void SqrtSquareSum(ep::Stream* stream, int64_t n, const T* x, T* y, T* tmp) { + const int32_t num_blocks = BlocksNum4ThreadsNum(n); + CHECK_GE(num_blocks, 0); + if (num_blocks == 1) { + SqrtSquareSumForOneThreadBlock + <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); + } else { + Memset(stream, y, 0, sizeof(T)); + SquareSumForMultiThreadBlock + <<As()->cuda_stream()>>>( + n, x, tmp); + SqrtSumForMultiThreadBlock + <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>( + num_blocks, tmp, y); + } + } +}; + +#define INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct SqrtSquareSumKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); +#undef INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/square_sum_kernel_util.hip.cpp b/oneflow/user/kernels/square_sum_kernel_util.hip.cpp index 490c6b9..62c8562 100644 --- a/oneflow/user/kernels/square_sum_kernel_util.hip.cpp +++ b/oneflow/user/kernels/square_sum_kernel_util.hip.cpp @@ -1,105 +1,105 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
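// ---------------------------------------------------------------------------
// Illustrative sketch of the multi-tensor batching used by MultiSquareSumGpu in
// the square_sum diff below: up to a fixed number of {ptr, count} entries are
// packed into a small struct passed to the kernel by value, each block reduces
// its partial sum with hipcub::BlockReduce, and block leaders accumulate into
// *y with an atomic add (so y must be zeroed beforehand). The 64-entry limit
// mirrors the diff; the rest is an assumed minimal form.
// ---------------------------------------------------------------------------
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>

namespace sketch {

constexpr int kMaxEntries = 64;
constexpr int kBlockThreads = 256;  // assumed block size

template<typename T>
struct SquareSumEntry {
  const T* ptr;
  int64_t count;
};

template<typename T>
struct MultiSquareSumArgs {
  SquareSumEntry<T> entries[kMaxEntries];
  int32_t size;
};

template<typename T>
__global__ void MultiSquareSum(MultiSquareSumArgs<T> args, T* y) {
  T thread_sum = 0;
  for (int32_t e = 0; e < args.size; ++e) {
    const SquareSumEntry<T> entry = args.entries[e];
    for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < entry.count;
         i += static_cast<int64_t>(blockDim.x) * gridDim.x) {
      thread_sum += entry.ptr[i] * entry.ptr[i];
    }
  }
  using BlockReduce = hipcub::BlockReduce<T, kBlockThreads>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const T block_sum = BlockReduce(temp_storage).Sum(thread_sum);
  if (threadIdx.x == 0) { atomicAdd(y, block_sum); }  // y pre-initialized to 0
}

}  // namespace sketch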
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/square_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__global__ void SquareSumGpu(int64_t n, const T* x, T* y) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { - if (ONE_BLOCK) { - *y = b_sum; - } else { - cuda::atomic::Add(y, b_sum); - } - } -} - -constexpr int64_t kMultiSquareSumMaxSize = 64; - -template -struct MultiSquareSumParams { - SquareSumParam params[kMultiSquareSumMaxSize]; - int32_t size; -}; - -template -__global__ void MultiSquareSumGpu(const MultiSquareSumParams params, T* y) { - T t_sum = 0; - for (int i = 0; i < params.size; ++i) { - const SquareSumParam param = params.params[i]; - CUDA_1D_KERNEL_LOOP(j, param.count) { t_sum += param.ptr[j] * param.ptr[j]; } - } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { cuda::atomic::Add(y, b_sum); } -} - -} // namespace - -template -struct SquareSumKernelUtil { - static void SquareSum(ep::Stream* stream, int64_t n, const T* x, T* y) { - const int32_t num_blocks = BlocksNum4ThreadsNum(n); - CHECK_GE(num_blocks, 0); - if (num_blocks == 0) { - Memset(stream, y, 0, sizeof(T)); - } else if (num_blocks == 1) { - SquareSumGpu - <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); - } else { - Memset(stream, y, 0, sizeof(T)); - SquareSumGpu - <<As()->cuda_stream()>>>( - n, x, y); - } - } - - static void MultiSquareSum(ep::Stream* stream, const std::vector>& params, - T* y) { - Memset(stream, y, 0, sizeof(T)); - for (int64_t start = 0; start < params.size(); start += kMultiSquareSumMaxSize) { - MultiSquareSumParams gpu_params{}; - int64_t max_count = 0; - gpu_params.size = std::min(start + kMultiSquareSumMaxSize, params.size()) - start; - for (int64_t i = 0; i < gpu_params.size; ++i) { - gpu_params.params[i] = params[start + i]; - max_count = std::max(max_count, gpu_params.params[i].count); - } - MultiSquareSumGpu<<As()->cuda_stream()>>>(gpu_params, y); - } - } -}; - -#define INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct SquareSumKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); -#undef INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/square_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__global__ void SquareSumGpu(int64_t n, const T* x, T* y) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { + if (ONE_BLOCK) { + *y = b_sum; + } else { + cuda::atomic::Add(y, b_sum); + } + } +} + +constexpr int64_t kMultiSquareSumMaxSize = 64; + +template +struct MultiSquareSumParams { + SquareSumParam params[kMultiSquareSumMaxSize]; + int32_t size; +}; + +template +__global__ void MultiSquareSumGpu(const MultiSquareSumParams params, T* y) { + T t_sum = 0; + for (int i = 0; i < params.size; ++i) { + const SquareSumParam param = params.params[i]; + CUDA_1D_KERNEL_LOOP(j, param.count) { t_sum += param.ptr[j] * param.ptr[j]; } + } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { cuda::atomic::Add(y, b_sum); } +} + +} // namespace + +template +struct SquareSumKernelUtil { + static void SquareSum(ep::Stream* stream, int64_t n, const T* x, T* y) { + const int32_t num_blocks = BlocksNum4ThreadsNum(n); + CHECK_GE(num_blocks, 0); + if (num_blocks == 0) { + Memset(stream, y, 0, sizeof(T)); + } else if (num_blocks == 1) { + SquareSumGpu + <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); + } else { + Memset(stream, y, 0, sizeof(T)); + SquareSumGpu + <<As()->cuda_stream()>>>( + n, x, y); + } + } + + static void MultiSquareSum(ep::Stream* stream, const std::vector>& params, + T* y) { + Memset(stream, y, 0, sizeof(T)); + for (int64_t start = 0; start < params.size(); start += kMultiSquareSumMaxSize) { + MultiSquareSumParams gpu_params{}; + int64_t max_count = 0; + gpu_params.size = std::min(start + kMultiSquareSumMaxSize, params.size()) - start; + for (int64_t i = 0; i < gpu_params.size; ++i) { + gpu_params.params[i] = params[start + i]; + max_count = std::max(max_count, gpu_params.params[i].count); + } + MultiSquareSumGpu<<As()->cuda_stream()>>>(gpu_params, y); + } + } +}; + +#define INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct SquareSumKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); +#undef INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/stateful_opkernel.cpp b/oneflow/user/kernels/stateful_opkernel.cpp index b72a456..db373fe 100644 --- a/oneflow/user/kernels/stateful_opkernel.cpp +++ b/oneflow/user/kernels/stateful_opkernel.cpp @@ -1,901 +1,901 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/stateful_opkernel.h" -#include "oneflow/core/framework/attr_value_accessor.h" -#include "oneflow/core/framework/user_op_conf.h" -#include "oneflow/core/framework/user_op_registry_manager.h" -#include "oneflow/core/eager/eager_blob_object.h" -#include "oneflow/core/framework/attr_map.h" -#include "oneflow/core/rpc/include/global_process_ctx.h" -#include "oneflow/core/framework/consistent_tensor_infer_cache.h" -#include "oneflow/core/operator/operator.h" -#include "oneflow/core/profiler/profiler.h" -#include "oneflow/core/profiler/profile_manager.h" -#include "oneflow/core/profiler/event_recorder.h" -#include "oneflow/core/eager/call_context.h" - -namespace oneflow { -namespace one { - -class ConsistentTensorInferResult; - -using ArgVec = std::vector>; - -using EagerBlobObjectListRawPtr = const std::vector>*; -using ConsistentTensorInferResultRawPtr = const ConsistentTensorInferResult*; - -class ZeroCopyBaseContextHelper { - public: - ZeroCopyBaseContextHelper(const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : input_arg_tuple_(input_arg_tuple), output_arg_tuple_(output_arg_tuple) {} - -#define RETURN_IF_FOUND(inputs, outputs, post_action) \ - int32_t i = TryGetTensorTupleIndex(input_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), \ - arg_name, index); \ - if (i >= 0) { return (inputs).at(i) post_action; } \ - i = TryGetTensorTupleIndex(output_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), arg_name, \ - index); \ - if (i >= 0) { return (outputs).at(i) post_action; } - - user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - const int32_t index) const { - RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); - return nullptr; - } - - user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - const int32_t index) const { - RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); - if (arg_name == "tmp_buffer" && index == 0) { return call_ctx->mut_tmp_tensor(); } - return nullptr; - } - - const ConsistentTensorMeta* ConsistentTensorMeta4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - const int32_t index) const { - const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result(); - RETURN_IF_FOUND(consistent_tensor_infer_result->input_tensor_metas(), - consistent_tensor_infer_result->output_tensor_metas(), - .shared_from_symbol().get()); - return nullptr; - } - - Optional> parallel_desc(eager::CallContext* call_ctx) const { - const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result(); - if (!consistent_tensor_infer_result) { return Optional>(); } - if (!consistent_tensor_infer_result->input_tensor_metas().empty()) { - return consistent_tensor_infer_result->input_tensor_metas().at(0)->parallel_desc(); - } else if (!consistent_tensor_infer_result->output_tensor_metas().empty()) { - return consistent_tensor_infer_result->output_tensor_metas().at(0)->parallel_desc(); - } else { - UNIMPLEMENTED(); - return Optional>(); - } - } - - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - const auto& parallel_desc = this->parallel_desc(call_ctx); - if (parallel_desc.has_value()) { - const auto& parallel_desc_symbol = CHECK_JUST(parallel_desc); - return *CHECK_JUST(GetParallelContext4CurrentProcessCtx(parallel_desc_symbol)); - } else { - static ParallelContext 
single_device_parallel_ctx(MakeSingleDeviceParallelCtx()); - return single_device_parallel_ctx; - } - } - - const ArgVec& inputs() const { return input_arg_tuple_->indexed_arg_name_and_index(); } - const ArgVec& outputs() const { return output_arg_tuple_->indexed_arg_name_and_index(); } - - private: - static int32_t TryGetTensorTupleIndex(const std::unordered_map>& - arg_name2bn_index2tensor_tuple_index, - const std::string& arg_name, const int32_t arg_index) { - auto it = arg_name2bn_index2tensor_tuple_index.find(arg_name); - if (it != arg_name2bn_index2tensor_tuple_index.end()) { return it->second.at(arg_index); } - return -1; - } - - static ParallelContext MakeSingleDeviceParallelCtx() { - ParallelContext single_device_parallel_ctx; - single_device_parallel_ctx.set_parallel_id(0); - single_device_parallel_ctx.set_parallel_num(1); - return single_device_parallel_ctx; - } - - std::shared_ptr input_arg_tuple_; - std::shared_ptr output_arg_tuple_; -}; - -class UserKernelBaseContextHelper final : public ZeroCopyBaseContextHelper { - public: - UserKernelBaseContextHelper(DeviceType device_type, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : ZeroCopyBaseContextHelper(input_arg_tuple, output_arg_tuple), device_type_(device_type) {} - - ~UserKernelBaseContextHelper() = default; - - DeviceType device_type() const { return device_type_; } - const JobDesc& job_desc() const { - UNIMPLEMENTED(); - return *(const JobDesc*)nullptr; - } - - private: - const DeviceType device_type_; -}; - -class UserOpInferContextHelper final { - public: - UserOpInferContextHelper(const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - zero_copy_base_ctx_helper_(input_arg_tuple, output_arg_tuple) {} - - ~UserOpInferContextHelper() = default; - - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - UNIMPLEMENTED(); - return nullptr; - } - - const user_op::TensorDesc& InputTensorDesc(eager::CallContext* call_ctx, - const std::string& arg_name, int32_t index) const { - return *CHECK_NOTNULL(TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)); - } - - user_op::TensorDesc* OutputTensorDesc(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return zero_copy_base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - - const Shape& InputShape(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *Shape4ArgNameAndIndex(call_ctx, arg_name, index); - } - Shape* OutputShape(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return Shape4ArgNameAndIndex(call_ctx, arg_name, index); - } - Shape* Shape4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_shape(); - } - const Stride& InputStride(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *Stride4ArgNameAndIndex(call_ctx, arg_name, index); - } - Stride* OutputStride(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const 
{ - return Stride4ArgNameAndIndex(call_ctx, arg_name, index); - } - Stride* Stride4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_stride(); - } - const DataType& InputDType(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *Dtype4ArgNameAndIndex(call_ctx, arg_name, index); - } - DataType* OutputDType(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return Dtype4ArgNameAndIndex(call_ctx, arg_name, index); - } - DataType* Dtype4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_data_type(); - } - bool InputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); - } - bool* OutputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); - } - bool* IsDynamic4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_is_dynamic(); - } - - const ArgVec& inputs() const { return zero_copy_base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return zero_copy_base_ctx_helper_.outputs(); } - const JobDesc* job_desc() const { - UNIMPLEMENTED(); - return nullptr; - } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return zero_copy_base_ctx_helper_.parallel_ctx(call_ctx); - } - const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { - return *CHECK_JUST(zero_copy_base_ctx_helper_.parallel_desc(call_ctx)); - } - const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, int32_t index) const { - const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); - CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); - return nd_sbp.sbp_parallel(0); - } - const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *CHECK_NOTNULL(zero_copy_base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex( - call_ctx, arg_name, index)) - ->nd_sbp(); - } - - int64_t parallel_num(eager::CallContext* call_ctx) const { - return parallel_ctx(call_ctx).parallel_num(); - } - - const std::string& input(const std::string& arg_name, int32_t index) const { - return user_op_conf().input(arg_name, index); - } - const std::string& output(const std::string& arg_name, int32_t index) const { - return user_op_conf().output(arg_name, index); - } - bool has_input(const std::string& arg_name, int32_t index) const { - return user_op_conf().has_input(arg_name, index); - } - bool has_output(const std::string& arg_name, int32_t index) const { - return user_op_conf().has_output(arg_name, index); - } - int32_t input_size(const std::string& arg_name) const { - return user_op_conf().input_size(arg_name); - } - int32_t output_size(const std::string& arg_name) const { - return user_op_conf().output_size(arg_name); - } - const std::string& op_name() const { return user_op_conf().op_name(); } - const std::string& op_type_name() const { return user_op_conf().op_type_name(); } - const std::string& op_loc() const { return user_op_conf_->op_conf().loc(); } - 
- const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - private: - user_op::TensorDesc* NonNullTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - user_op::TensorDesc* tensor_desc = TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - if (!tensor_desc) { LOG(FATAL) << "Arg (" << arg_name << "," << index << ") is not found"; } - return tensor_desc; - } - - const user_op::UserOpConfWrapper* user_op_conf_; - ZeroCopyBaseContextHelper zero_copy_base_ctx_helper_; -}; - -class UserOpInferContext : public user_op::InferContext { - public: - UserOpInferContext(const UserOpInferContextHelper* helper, eager::CallContext* call_ctx) - : helper_(helper), call_ctx_(call_ctx) {} - - ~UserOpInferContext() override = default; - - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const user_op::TensorDesc& InputTensorDesc(const std::string& arg_name, - int32_t index) const override { - return helper_->InputTensorDesc(call_ctx_, arg_name, index); - } - user_op::TensorDesc* OutputTensorDesc(const std::string& arg_name, int32_t index) override { - return helper_->OutputTensorDesc(call_ctx_, arg_name, index); - } - user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, int32_t index) { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const Shape& InputShape(const std::string& arg_name, int32_t index) const override { - return helper_->InputShape(call_ctx_, arg_name, index); - } - Shape* OutputShape(const std::string& arg_name, int32_t index) override { - return helper_->OutputShape(call_ctx_, arg_name, index); - } - Shape* Shape4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Shape4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const Stride& InputStride(const std::string& arg_name, int32_t index) const override { - return helper_->InputStride(call_ctx_, arg_name, index); - } - Stride* OutputStride(const std::string& arg_name, int32_t index) override { - return helper_->OutputStride(call_ctx_, arg_name, index); - } - Stride* Stride4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Stride4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const DataType& InputDType(const std::string& arg_name, int32_t index) const override { - return helper_->InputDType(call_ctx_, arg_name, index); - } - DataType* OutputDType(const std::string& arg_name, int32_t index) override { - return helper_->OutputDType(call_ctx_, arg_name, index); - } - DataType* Dtype4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Dtype4ArgNameAndIndex(call_ctx_, arg_name, index); - } - bool InputIsDynamic(const std::string& arg_name, int32_t index) const override { - return helper_->InputIsDynamic(call_ctx_, arg_name, index); - } - bool* OutputIsDynamic(const std::string& arg_name, int32_t index) override { - return helper_->OutputIsDynamic(call_ctx_, arg_name, index); - } - bool* IsDynamic4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->IsDynamic4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const ArgVec& 
inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - const JobDesc* job_desc() const override { return helper_->job_desc(); } - const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } - const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { - return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - int64_t parallel_num() const override { return helper_->parallel_num(call_ctx_); } - - const std::string& input(const std::string& arg_name, int32_t index) const override { - return helper_->input(arg_name, index); - } - const std::string& output(const std::string& arg_name, int32_t index) const override { - return helper_->output(arg_name, index); - } - bool has_input(const std::string& arg_name, int32_t index) const override { - return helper_->has_input(arg_name, index); - } - bool has_output(const std::string& arg_name, int32_t index) const override { - return helper_->has_output(arg_name, index); - } - int32_t input_size(const std::string& arg_name) const override { - return helper_->input_size(arg_name); - } - int32_t output_size(const std::string& arg_name) const override { - return helper_->output_size(arg_name); - } - const std::string& op_name() const override { return helper_->op_name(); } - const std::string& op_type_name() const override { return helper_->op_type_name(); } - const std::string& op_loc() const override { return helper_->op_loc(); } - - private: - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const UserOpInferContextHelper* helper_; - eager::CallContext* call_ctx_; -}; - -class UserKernelComputeContextHelper final { - public: - UserKernelComputeContextHelper(DeviceType device_type, - const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} - - ~UserKernelComputeContextHelper() = default; - - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - - user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.Tensor4ArgNameAndIndex(call_ctx, arg_name, index); - } - ep::Stream* stream(DeviceCtx* device_ctx) const { - CHECK(device_ctx); - return device_ctx->stream(); - } - - DeviceType device_type() const { return base_ctx_helper_.device_type(); } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return base_ctx_helper_.parallel_ctx(call_ctx); - } - - const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } - - const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& 
attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - private: - const user_op::UserOpConfWrapper* user_op_conf_; - UserKernelBaseContextHelper base_ctx_helper_; -}; - -class UserKernelComputeContext final : public user_op::KernelComputeContext { - public: - UserKernelComputeContext(const UserKernelComputeContextHelper* helper, - eager::CallContext* call_ctx, DeviceCtx* device_ctx) - : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} - - ~UserKernelComputeContext() = default; - - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - user_op::Tensor* Tensor4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Tensor4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - ep::Stream* stream() override { return helper_->stream(device_ctx_); } - - DeviceType device_type() const override { return helper_->device_type(); } - - const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - - const ArgVec& inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - - private: - const user_op::UserOpConfWrapper& user_op_conf() const override { - return helper_->user_op_conf(); - } - - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const UserKernelComputeContextHelper* helper_; - eager::CallContext* call_ctx_; - DeviceCtx* device_ctx_; -}; - -class UserKernelRegContextHelper final { - public: - UserKernelRegContextHelper(DeviceType device_type, const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} - ~UserKernelRegContextHelper() = default; - - DeviceType device_type() const { return base_ctx_helper_.device_type(); } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return base_ctx_helper_.parallel_ctx(call_ctx); - } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } - - const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - private: - const user_op::UserOpConfWrapper* user_op_conf_; - UserKernelBaseContextHelper base_ctx_helper_; -}; - -class UserKernelRegContext final : public user_op::KernelRegContext { - public: - UserKernelRegContext(const UserKernelRegContextHelper* helper, eager::CallContext* call_ctx) - : helper_(helper), call_ctx_(call_ctx) {} - ~UserKernelRegContext() = default; - - DeviceType device_type() const override { return helper_->device_type(); } - const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const 
override { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const ArgVec& inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - - const user_op::UserOpConfWrapper& user_op_conf() const override { - return helper_->user_op_conf(); - } - - private: - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const UserKernelRegContextHelper* helper_; - eager::CallContext* call_ctx_; -}; - -class UserKernelInitAndCacheContextHelper final { - public: - UserKernelInitAndCacheContextHelper(DeviceType device_type, - const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} - - ~UserKernelInitAndCacheContextHelper() = default; - - ep::Stream* stream(DeviceCtx* device_ctx) const { - CHECK(device_ctx); - return device_ctx->stream(); - } - - DeviceType device_type() const { return base_ctx_helper_.device_type(); } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return base_ctx_helper_.parallel_ctx(call_ctx); - } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index); - } - const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, int32_t index) const { - const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); - CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); - return nd_sbp.sbp_parallel(0); - } - - const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *CHECK_NOTNULL( - base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index)) - ->nd_sbp(); - } - - const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } - const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { - return *CHECK_JUST(base_ctx_helper_.parallel_desc(call_ctx)); - } - - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - - private: - const user_op::UserOpConfWrapper* user_op_conf_; - UserKernelBaseContextHelper base_ctx_helper_; -}; - -class UserKernelInitAndCacheContext final : public user_op::KernelInitContext, - public user_op::KernelCacheContext { - public: - UserKernelInitAndCacheContext(const UserKernelInitAndCacheContextHelper* helper, - eager::CallContext* call_ctx, DeviceCtx* device_ctx) - : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} - - ~UserKernelInitAndCacheContext() override = default; - - ep::Stream* stream() override { return helper_->stream(device_ctx_); } - - DeviceType device_type() const override { return helper_->device_type(); } - const ParallelContext& 
parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { - return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const ArgVec& inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } - - private: - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const user_op::UserOpConfWrapper& user_op_conf() const override { - return helper_->user_op_conf(); - } - - const UserKernelInitAndCacheContextHelper* helper_; - eager::CallContext* call_ctx_; - DeviceCtx* device_ctx_; -}; - -namespace { - -Maybe InitTensorTupleIndexes4Bns(const std::shared_ptr& op_conf, - const ArgVec& indexed_input_pairs, - const ArgVec& indexed_output_pairs, - std::vector* input_tuple_indexes4const_ibns, - std::vector* input_tuple_indexes4mut_ibns, - std::vector* output_tuple_indexes4mut_obns, - std::vector* output_tuple_indexes4mut2_obns) { - const auto* op_reg_val = - user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(op_conf->user_conf().op_type_name()); - CHECK_NOTNULL_OR_RETURN(op_reg_val); - - ArgModifierSignature arg_modifier_signature; - for (const auto& pair : indexed_input_pairs) { - const std::string ibn = GenRepeatedBn(pair.first, pair.second); - arg_modifier_signature.mutable_ibn2input_blob_modifier()->insert( - {ibn, user_op::InputArgModifier()}); - } - for (const auto& pair : indexed_output_pairs) { - const std::string obn = GenRepeatedBn(pair.first, pair.second); - arg_modifier_signature.mutable_obn2output_blob_modifier()->insert( - {obn, user_op::OutputArgModifier()}); - } - user_op::UserOpConfWrapper op_conf_wrapper(op_conf); - if (op_reg_val->input_arg_modify_fn) { - user_op::GetInputArgModifier GetInputArgModifierFn = - [&arg_modifier_signature](const std::string& in_arg_name, - int32_t in_arg_index) -> user_op::InputArgModifier* { - const std::string ibn = GenRepeatedBn(in_arg_name, in_arg_index); - auto* map = arg_modifier_signature.mutable_ibn2input_blob_modifier(); - return &map->at(ibn); - }; - JUST(op_reg_val->input_arg_modify_fn(GetInputArgModifierFn, op_conf_wrapper)); - } - if (op_reg_val->output_arg_modify_fn) { - user_op::GetOutputArgModifier GetOutputArgModifierFn = - [&arg_modifier_signature](const std::string& in_arg_name, - int32_t in_arg_index) -> user_op::OutputArgModifier* { - const std::string obn = GenRepeatedBn(in_arg_name, in_arg_index); - auto* map = arg_modifier_signature.mutable_obn2output_blob_modifier(); - return &map->at(obn); - }; - JUST(op_reg_val->output_arg_modify_fn(GetOutputArgModifierFn, op_conf_wrapper)); - } - - for (int i = 0; i < indexed_input_pairs.size(); i++) { - const auto& pair = 
indexed_input_pairs.at(i); - const std::string ibn = GenRepeatedBn(pair.first, pair.second); - if (arg_modifier_signature.ibn2input_blob_modifier().at(ibn).is_mutable()) { - input_tuple_indexes4mut_ibns->emplace_back(i); - } else { - input_tuple_indexes4const_ibns->emplace_back(i); - } - } - - for (int i = 0; i < indexed_output_pairs.size(); i++) { - const auto& pair = indexed_output_pairs.at(i); - const std::string obn = GenRepeatedBn(pair.first, pair.second); - if (arg_modifier_signature.obn2output_blob_modifier().at(obn).header_infered_before_compute()) { - output_tuple_indexes4mut_obns->emplace_back(i); - } else { - output_tuple_indexes4mut2_obns->emplace_back(i); - } - } - return Maybe::Ok(); -} - -} // namespace - -/* static */ Maybe StatefulOpKernel::New( - const std::shared_ptr& op_conf, const Symbol& stream, - const AttrMap& base_attrs, const std::shared_ptr& parallel_desc, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) { - auto opkernel = std::shared_ptr(new StatefulOpKernel()); - opkernel->base_attrs_ = base_attrs; - opkernel->op_conf_ = op_conf; - opkernel->user_op_conf_.reset(new user_op::UserOpConfWrapper(op_conf)); - opkernel->stream_ = stream; - opkernel->input_arg_tuple_ = input_arg_tuple; - opkernel->output_arg_tuple_ = output_arg_tuple; - opkernel->need_check_mem_case_ = true; - - const DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(op_conf->device_tag())); - const user_op::UserOpConfWrapper* user_op_conf = opkernel->user_op_conf_.get(); - opkernel->op_infer_ctx_helper_.reset( - new UserOpInferContextHelper(user_op_conf, input_arg_tuple, output_arg_tuple)); - - opkernel->init_and_cache_ctx_helper_.reset(new UserKernelInitAndCacheContextHelper( - device_type, opkernel->user_op_conf_.get(), opkernel->input_arg_tuple_, - opkernel->output_arg_tuple_)); - opkernel->compute_ctx_helper_.reset(new UserKernelComputeContextHelper( - device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); - opkernel->reg_ctx_helper_.reset( - new UserKernelRegContextHelper(device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); - const auto* op_reg_val = - user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(user_op_conf->op_type_name()); - CHECK_NOTNULL_OR_RETURN(op_reg_val); - if (op_reg_val->logical_tensor_desc_infer_fn) { - opkernel->tensor_desc_infer_fn_ = op_reg_val->logical_tensor_desc_infer_fn; - } else { - return Error::UnimplementedError(); - } - opkernel->data_type_infer_fn_ = op_reg_val->data_type_infer_fn; - - JUST(InitTensorTupleIndexes4Bns( - op_conf, input_arg_tuple->indexed_arg_name_and_index(), - output_arg_tuple->indexed_arg_name_and_index(), &opkernel->input_tuple_indexes4const_ibns_, - &opkernel->input_tuple_indexes4mut_ibns_, &opkernel->output_tuple_indexes4mut_obns_, - &opkernel->output_tuple_indexes4mut2_obns_)); - - return opkernel; -} - -StatefulOpKernel::~StatefulOpKernel() = default; - -size_t StatefulOpKernel::InferTmpSize(eager::CallContext* call_ctx, - const user_op::OpKernel* user_opkernel) const { - UserOpInferContext op_infer_ctx(op_infer_ctx_helper_.get(), call_ctx); - const auto& InferTmpSizeFn = GetInferTmpSizeFn(user_opkernel); - return InferTmpSizeFn(&op_infer_ctx); -} - -Maybe StatefulOpKernel::ChooseOpKernel(eager::CallContext* call_ctx, - const user_op::OpKernel** user_opkernel, - bool* need_temp_storage) { - OF_PROFILER_RANGE_GUARD("ChooseOpKernel"); - DataType primary_dtype = kInvalidDataType; - const auto& inputs = call_ctx->inputs(); - const auto& outputs = call_ctx->outputs(); - if 
(likely(!inputs->empty())) { - primary_dtype = (*inputs)[0]->data_type(); - } else if (likely(!outputs->empty())) { - primary_dtype = (*outputs)[0]->data_type(); - } else { - // do nothing - } - - UserKernelRegContext reg_ctx(reg_ctx_helper_.get(), call_ctx); - for (const auto& pair : dtype2cached_kernels_[primary_dtype]) { - if (likely(pair.first->is_matched_hob->get(reg_ctx))) { - *need_temp_storage = pair.first->need_temp_storage; - *user_opkernel = pair.second.get(); - return Maybe::Ok(); - } - } - - OF_PROFILER_RANGE_GUARD("fallback"); - - const auto& op_type_name = user_op_conf_->op_type_name(); - const auto* kernel_reg_val = - JUST(user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult(op_type_name, reg_ctx)); - CHECK_NOTNULL(kernel_reg_val); - auto* kernel = kernel_reg_val->create_fn(); - dtype2cached_kernels_[primary_dtype].push_back( - {kernel_reg_val, std::shared_ptr(kernel)}); - - infer_tmp_size_fn_map_.emplace(kernel, &kernel_reg_val->infer_tmp_size_fn); - *need_temp_storage = kernel_reg_val->need_temp_storage; - *user_opkernel = kernel; - return Maybe::Ok(); -} - -void StatefulOpKernel::TryInitOpKernelStateAndCache(eager::CallContext* call_ctx, - DeviceCtx* device_ctx, - const user_op::OpKernel* op_kernel, - user_op::OpKernelState** state, - user_op::OpKernelCache** cache) { - UserKernelInitAndCacheContext init_and_cache_ctx(init_and_cache_ctx_helper_.get(), call_ctx, - device_ctx); - if (state != nullptr) { - auto it = op_kernel_state_map_.find(op_kernel); - if (it != op_kernel_state_map_.end()) { - *state = it->second.get(); - } else { - auto created_state = op_kernel->CreateOpKernelState(&init_and_cache_ctx); - op_kernel_state_map_.emplace(op_kernel, created_state); - *state = created_state.get(); - } - } - - { - auto& cache_in_map = op_kernel_cache_map_[op_kernel]; - op_kernel->InitOpKernelCacheWithFlags(&init_and_cache_ctx, - user_op::OpKernelCache::kAllMayChanged, &cache_in_map); - *cache = cache_in_map.get(); - } -} - -const user_op::InferTmpSizeFn& StatefulOpKernel::GetInferTmpSizeFn( - const user_op::OpKernel* op_kernel) const { - return *infer_tmp_size_fn_map_.at(op_kernel); -} - -user_op::TensorDescInferFn StatefulOpKernel::TensorDescInferFn() const { - return tensor_desc_infer_fn_; -} - -user_op::DataTypeInferFn StatefulOpKernel::DataTypeInferFn() const { return data_type_infer_fn_; } - -void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_ctx, - const user_op::OpKernel* user_opkernel, - user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const { - UserKernelComputeContext compute_context(compute_ctx_helper_.get(), call_ctx, device_ctx); - auto* compute_ctx = &compute_context; - OF_PROFILER_RANGE_GUARD("Compute"); - if (Singleton::Get()) { -#if defined(WITH_CUDA) || defined(WITH_ROCM) - const auto CalMemorySize = [compute_ctx](const one::ArgVec& args) -> int64_t { - const auto Func = [compute_ctx](int64_t mem_size, const auto& pair) { - const auto tensor = compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second); - return mem_size + tensor->shape_view().elem_cnt() * GetSizeOfDataType(tensor->data_type()); - }; - return std::accumulate(args.begin(), args.end(), static_cast(0), Func); - }; -#endif - auto er_guard = CHECK_JUST(profiler::EventRecorder::CreateKernelEventRecorder( - op_type_name(), -#if defined(WITH_CUDA) || defined(WITH_ROCM) - [compute_ctx, CalMemorySize]() -> int64_t { - return CalMemorySize(compute_ctx->inputs()) + CalMemorySize(compute_ctx->outputs()); - }, -#endif - [compute_ctx]() -> 
std::vector { - std::vector shapes; - for (const auto& pair : compute_ctx->inputs()) { - shapes.emplace_back( - compute_ctx->TensorDesc4ArgNameAndIndex(pair.first, pair.second)->shape()); - } - return shapes; - })); - user_opkernel->Compute(compute_ctx, state, cache); - } else { - user_opkernel->Compute(compute_ctx, state, cache); - } -} - -} // namespace one -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/stateful_opkernel.h" +#include "oneflow/core/framework/attr_value_accessor.h" +#include "oneflow/core/framework/user_op_conf.h" +#include "oneflow/core/framework/user_op_registry_manager.h" +#include "oneflow/core/eager/eager_blob_object.h" +#include "oneflow/core/framework/attr_map.h" +#include "oneflow/core/rpc/include/global_process_ctx.h" +#include "oneflow/core/framework/consistent_tensor_infer_cache.h" +#include "oneflow/core/operator/operator.h" +#include "oneflow/core/profiler/profiler.h" +#include "oneflow/core/profiler/profile_manager.h" +#include "oneflow/core/profiler/event_recorder.h" +#include "oneflow/core/eager/call_context.h" + +namespace oneflow { +namespace one { + +class ConsistentTensorInferResult; + +using ArgVec = std::vector>; + +using EagerBlobObjectListRawPtr = const std::vector>*; +using ConsistentTensorInferResultRawPtr = const ConsistentTensorInferResult*; + +class ZeroCopyBaseContextHelper { + public: + ZeroCopyBaseContextHelper(const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : input_arg_tuple_(input_arg_tuple), output_arg_tuple_(output_arg_tuple) {} + +#define RETURN_IF_FOUND(inputs, outputs, post_action) \ + int32_t i = TryGetTensorTupleIndex(input_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), \ + arg_name, index); \ + if (i >= 0) { return (inputs).at(i) post_action; } \ + i = TryGetTensorTupleIndex(output_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), arg_name, \ + index); \ + if (i >= 0) { return (outputs).at(i) post_action; } + + user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + const int32_t index) const { + RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); + return nullptr; + } + + user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + const int32_t index) const { + RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); + if (arg_name == "tmp_buffer" && index == 0) { return call_ctx->mut_tmp_tensor(); } + return nullptr; + } + + const ConsistentTensorMeta* ConsistentTensorMeta4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + const int32_t index) const { + const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result(); + RETURN_IF_FOUND(consistent_tensor_infer_result->input_tensor_metas(), + consistent_tensor_infer_result->output_tensor_metas(), + .shared_from_symbol().get()); + return nullptr; + } + + Optional> 
parallel_desc(eager::CallContext* call_ctx) const {
+    const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result();
+    if (!consistent_tensor_infer_result) { return Optional<Symbol<ParallelDesc>>(); }
+    if (!consistent_tensor_infer_result->input_tensor_metas().empty()) {
+      return consistent_tensor_infer_result->input_tensor_metas().at(0)->parallel_desc();
+    } else if (!consistent_tensor_infer_result->output_tensor_metas().empty()) {
+      return consistent_tensor_infer_result->output_tensor_metas().at(0)->parallel_desc();
+    } else {
+      UNIMPLEMENTED();
+      return Optional<Symbol<ParallelDesc>>();
+    }
+  }
+
+  const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const {
+    const auto& parallel_desc = this->parallel_desc(call_ctx);
+    if (parallel_desc.has_value()) {
+      const auto& parallel_desc_symbol = CHECK_JUST(parallel_desc);
+      return *CHECK_JUST(GetParallelContext4CurrentProcessCtx(parallel_desc_symbol));
+    } else {
+      static ParallelContext single_device_parallel_ctx(MakeSingleDeviceParallelCtx());
+      return single_device_parallel_ctx;
+    }
+  }
+
+  const ArgVec& inputs() const { return input_arg_tuple_->indexed_arg_name_and_index(); }
+  const ArgVec& outputs() const { return output_arg_tuple_->indexed_arg_name_and_index(); }
+
+ private:
+  static int32_t TryGetTensorTupleIndex(const std::unordered_map<std::string, std::vector<int32_t>>&
+                                            arg_name2bn_index2tensor_tuple_index,
+                                        const std::string& arg_name, const int32_t arg_index) {
+    auto it = arg_name2bn_index2tensor_tuple_index.find(arg_name);
+    if (it != arg_name2bn_index2tensor_tuple_index.end()) { return it->second.at(arg_index); }
+    return -1;
+  }
+
+  static ParallelContext MakeSingleDeviceParallelCtx() {
+    ParallelContext single_device_parallel_ctx;
+    single_device_parallel_ctx.set_parallel_id(0);
+    single_device_parallel_ctx.set_parallel_num(1);
+    return single_device_parallel_ctx;
+  }
+
+  std::shared_ptr<const ArgTuple> input_arg_tuple_;
+  std::shared_ptr<const ArgTuple> output_arg_tuple_;
+};
+
+class UserKernelBaseContextHelper final : public ZeroCopyBaseContextHelper {
+ public:
+  UserKernelBaseContextHelper(DeviceType device_type,
+                              const std::shared_ptr<const ArgTuple>& input_arg_tuple,
+                              const std::shared_ptr<const ArgTuple>& output_arg_tuple)
+      : ZeroCopyBaseContextHelper(input_arg_tuple, output_arg_tuple), device_type_(device_type) {}
+
+  ~UserKernelBaseContextHelper() = default;
+
+  DeviceType device_type() const { return device_type_; }
+  const JobDesc& job_desc() const {
+    UNIMPLEMENTED();
+    return *(const JobDesc*)nullptr;
+  }
+
+ private:
+  const DeviceType device_type_;
+};
+
+class UserOpInferContextHelper final {
+ public:
+  UserOpInferContextHelper(const user_op::UserOpConfWrapper* user_op_conf,
+                           const std::shared_ptr<const ArgTuple>& input_arg_tuple,
+                           const std::shared_ptr<const ArgTuple>& output_arg_tuple)
+      : user_op_conf_(user_op_conf),
+        zero_copy_base_ctx_helper_(input_arg_tuple, output_arg_tuple) {}
+
+  ~UserOpInferContextHelper() = default;
+
+  const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx,
+                                                               const std::string& arg_name,
+                                                               int32_t index) const {
+    UNIMPLEMENTED();
+    return nullptr;
+  }
+
+  const user_op::TensorDesc& InputTensorDesc(eager::CallContext* call_ctx,
+                                             const std::string& arg_name, int32_t index) const {
+    return *CHECK_NOTNULL(TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index));
+  }
+
+  user_op::TensorDesc* OutputTensorDesc(eager::CallContext* call_ctx, const std::string& arg_name,
+                                        int32_t index) const {
+    return TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index);
+  }
+  user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx,
+                                                  const std::string& arg_name,
+
int32_t index) const { + return zero_copy_base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + + const Shape& InputShape(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *Shape4ArgNameAndIndex(call_ctx, arg_name, index); + } + Shape* OutputShape(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return Shape4ArgNameAndIndex(call_ctx, arg_name, index); + } + Shape* Shape4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_shape(); + } + const Stride& InputStride(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *Stride4ArgNameAndIndex(call_ctx, arg_name, index); + } + Stride* OutputStride(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return Stride4ArgNameAndIndex(call_ctx, arg_name, index); + } + Stride* Stride4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_stride(); + } + const DataType& InputDType(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *Dtype4ArgNameAndIndex(call_ctx, arg_name, index); + } + DataType* OutputDType(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return Dtype4ArgNameAndIndex(call_ctx, arg_name, index); + } + DataType* Dtype4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_data_type(); + } + bool InputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); + } + bool* OutputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); + } + bool* IsDynamic4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_is_dynamic(); + } + + const ArgVec& inputs() const { return zero_copy_base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return zero_copy_base_ctx_helper_.outputs(); } + const JobDesc* job_desc() const { + UNIMPLEMENTED(); + return nullptr; + } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return zero_copy_base_ctx_helper_.parallel_ctx(call_ctx); + } + const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { + return *CHECK_JUST(zero_copy_base_ctx_helper_.parallel_desc(call_ctx)); + } + const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, int32_t index) const { + const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); + CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); + return nd_sbp.sbp_parallel(0); + } + const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *CHECK_NOTNULL(zero_copy_base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex( + call_ctx, arg_name, index)) + ->nd_sbp(); + } + + int64_t parallel_num(eager::CallContext* call_ctx) const { + return parallel_ctx(call_ctx).parallel_num(); + } + + const std::string& 
input(const std::string& arg_name, int32_t index) const { + return user_op_conf().input(arg_name, index); + } + const std::string& output(const std::string& arg_name, int32_t index) const { + return user_op_conf().output(arg_name, index); + } + bool has_input(const std::string& arg_name, int32_t index) const { + return user_op_conf().has_input(arg_name, index); + } + bool has_output(const std::string& arg_name, int32_t index) const { + return user_op_conf().has_output(arg_name, index); + } + int32_t input_size(const std::string& arg_name) const { + return user_op_conf().input_size(arg_name); + } + int32_t output_size(const std::string& arg_name) const { + return user_op_conf().output_size(arg_name); + } + const std::string& op_name() const { return user_op_conf().op_name(); } + const std::string& op_type_name() const { return user_op_conf().op_type_name(); } + const std::string& op_loc() const { return user_op_conf_->op_conf().loc(); } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + private: + user_op::TensorDesc* NonNullTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + user_op::TensorDesc* tensor_desc = TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + if (!tensor_desc) { LOG(FATAL) << "Arg (" << arg_name << "," << index << ") is not found"; } + return tensor_desc; + } + + const user_op::UserOpConfWrapper* user_op_conf_; + ZeroCopyBaseContextHelper zero_copy_base_ctx_helper_; +}; + +class UserOpInferContext : public user_op::InferContext { + public: + UserOpInferContext(const UserOpInferContextHelper* helper, eager::CallContext* call_ctx) + : helper_(helper), call_ctx_(call_ctx) {} + + ~UserOpInferContext() override = default; + + const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const user_op::TensorDesc& InputTensorDesc(const std::string& arg_name, + int32_t index) const override { + return helper_->InputTensorDesc(call_ctx_, arg_name, index); + } + user_op::TensorDesc* OutputTensorDesc(const std::string& arg_name, int32_t index) override { + return helper_->OutputTensorDesc(call_ctx_, arg_name, index); + } + user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, int32_t index) { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const Shape& InputShape(const std::string& arg_name, int32_t index) const override { + return helper_->InputShape(call_ctx_, arg_name, index); + } + Shape* OutputShape(const std::string& arg_name, int32_t index) override { + return helper_->OutputShape(call_ctx_, arg_name, index); + } + Shape* Shape4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Shape4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const Stride& InputStride(const std::string& arg_name, int32_t index) const override { + return helper_->InputStride(call_ctx_, arg_name, index); + } + Stride* OutputStride(const std::string& arg_name, int32_t index) override { + return helper_->OutputStride(call_ctx_, arg_name, index); + } + Stride* Stride4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Stride4ArgNameAndIndex(call_ctx_, 
arg_name, index); + } + const DataType& InputDType(const std::string& arg_name, int32_t index) const override { + return helper_->InputDType(call_ctx_, arg_name, index); + } + DataType* OutputDType(const std::string& arg_name, int32_t index) override { + return helper_->OutputDType(call_ctx_, arg_name, index); + } + DataType* Dtype4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Dtype4ArgNameAndIndex(call_ctx_, arg_name, index); + } + bool InputIsDynamic(const std::string& arg_name, int32_t index) const override { + return helper_->InputIsDynamic(call_ctx_, arg_name, index); + } + bool* OutputIsDynamic(const std::string& arg_name, int32_t index) override { + return helper_->OutputIsDynamic(call_ctx_, arg_name, index); + } + bool* IsDynamic4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->IsDynamic4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + const JobDesc* job_desc() const override { return helper_->job_desc(); } + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } + const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { + return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + int64_t parallel_num() const override { return helper_->parallel_num(call_ctx_); } + + const std::string& input(const std::string& arg_name, int32_t index) const override { + return helper_->input(arg_name, index); + } + const std::string& output(const std::string& arg_name, int32_t index) const override { + return helper_->output(arg_name, index); + } + bool has_input(const std::string& arg_name, int32_t index) const override { + return helper_->has_input(arg_name, index); + } + bool has_output(const std::string& arg_name, int32_t index) const override { + return helper_->has_output(arg_name, index); + } + int32_t input_size(const std::string& arg_name) const override { + return helper_->input_size(arg_name); + } + int32_t output_size(const std::string& arg_name) const override { + return helper_->output_size(arg_name); + } + const std::string& op_name() const override { return helper_->op_name(); } + const std::string& op_type_name() const override { return helper_->op_type_name(); } + const std::string& op_loc() const override { return helper_->op_loc(); } + + private: + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const UserOpInferContextHelper* helper_; + eager::CallContext* call_ctx_; +}; + +class UserKernelComputeContextHelper final { + public: + UserKernelComputeContextHelper(DeviceType device_type, + const user_op::UserOpConfWrapper* user_op_conf, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : user_op_conf_(user_op_conf), + base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} + + ~UserKernelComputeContextHelper() = default; + + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, 
+ int32_t index) const { + return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + + user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.Tensor4ArgNameAndIndex(call_ctx, arg_name, index); + } + ep::Stream* stream(DeviceCtx* device_ctx) const { + CHECK(device_ctx); + return device_ctx->stream(); + } + + DeviceType device_type() const { return base_ctx_helper_.device_type(); } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return base_ctx_helper_.parallel_ctx(call_ctx); + } + + const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + private: + const user_op::UserOpConfWrapper* user_op_conf_; + UserKernelBaseContextHelper base_ctx_helper_; +}; + +class UserKernelComputeContext final : public user_op::KernelComputeContext { + public: + UserKernelComputeContext(const UserKernelComputeContextHelper* helper, + eager::CallContext* call_ctx, DeviceCtx* device_ctx) + : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} + + ~UserKernelComputeContext() = default; + + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + user_op::Tensor* Tensor4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Tensor4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + ep::Stream* stream() override { return helper_->stream(device_ctx_); } + + DeviceType device_type() const override { return helper_->device_type(); } + + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + + private: + const user_op::UserOpConfWrapper& user_op_conf() const override { + return helper_->user_op_conf(); + } + + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const UserKernelComputeContextHelper* helper_; + eager::CallContext* call_ctx_; + DeviceCtx* device_ctx_; +}; + +class UserKernelRegContextHelper final { + public: + UserKernelRegContextHelper(DeviceType device_type, const user_op::UserOpConfWrapper* user_op_conf, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : user_op_conf_(user_op_conf), + base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} + ~UserKernelRegContextHelper() = default; + + DeviceType device_type() const { return base_ctx_helper_.device_type(); } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return base_ctx_helper_.parallel_ctx(call_ctx); + } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return 
base_ctx_helper_.outputs(); } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + + const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + private: + const user_op::UserOpConfWrapper* user_op_conf_; + UserKernelBaseContextHelper base_ctx_helper_; +}; + +class UserKernelRegContext final : public user_op::KernelRegContext { + public: + UserKernelRegContext(const UserKernelRegContextHelper* helper, eager::CallContext* call_ctx) + : helper_(helper), call_ctx_(call_ctx) {} + ~UserKernelRegContext() = default; + + DeviceType device_type() const override { return helper_->device_type(); } + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + + const user_op::UserOpConfWrapper& user_op_conf() const override { + return helper_->user_op_conf(); + } + + private: + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const UserKernelRegContextHelper* helper_; + eager::CallContext* call_ctx_; +}; + +class UserKernelInitAndCacheContextHelper final { + public: + UserKernelInitAndCacheContextHelper(DeviceType device_type, + const user_op::UserOpConfWrapper* user_op_conf, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : user_op_conf_(user_op_conf), + base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} + + ~UserKernelInitAndCacheContextHelper() = default; + + ep::Stream* stream(DeviceCtx* device_ctx) const { + CHECK(device_ctx); + return device_ctx->stream(); + } + + DeviceType device_type() const { return base_ctx_helper_.device_type(); } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return base_ctx_helper_.parallel_ctx(call_ctx); + } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index); + } + const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, int32_t index) const { + const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); + CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); + return nd_sbp.sbp_parallel(0); + } + + const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *CHECK_NOTNULL( + base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index)) + ->nd_sbp(); + } + + const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } + const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { + return *CHECK_JUST(base_ctx_helper_.parallel_desc(call_ctx)); + } + + 
const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + + private: + const user_op::UserOpConfWrapper* user_op_conf_; + UserKernelBaseContextHelper base_ctx_helper_; +}; + +class UserKernelInitAndCacheContext final : public user_op::KernelInitContext, + public user_op::KernelCacheContext { + public: + UserKernelInitAndCacheContext(const UserKernelInitAndCacheContextHelper* helper, + eager::CallContext* call_ctx, DeviceCtx* device_ctx) + : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} + + ~UserKernelInitAndCacheContext() override = default; + + ep::Stream* stream() override { return helper_->stream(device_ctx_); } + + DeviceType device_type() const override { return helper_->device_type(); } + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { + return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } + + private: + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const user_op::UserOpConfWrapper& user_op_conf() const override { + return helper_->user_op_conf(); + } + + const UserKernelInitAndCacheContextHelper* helper_; + eager::CallContext* call_ctx_; + DeviceCtx* device_ctx_; +}; + +namespace { + +Maybe InitTensorTupleIndexes4Bns(const std::shared_ptr& op_conf, + const ArgVec& indexed_input_pairs, + const ArgVec& indexed_output_pairs, + std::vector* input_tuple_indexes4const_ibns, + std::vector* input_tuple_indexes4mut_ibns, + std::vector* output_tuple_indexes4mut_obns, + std::vector* output_tuple_indexes4mut2_obns) { + const auto* op_reg_val = + user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(op_conf->user_conf().op_type_name()); + CHECK_NOTNULL_OR_RETURN(op_reg_val); + + ArgModifierSignature arg_modifier_signature; + for (const auto& pair : indexed_input_pairs) { + const std::string ibn = GenRepeatedBn(pair.first, pair.second); + arg_modifier_signature.mutable_ibn2input_blob_modifier()->insert( + {ibn, user_op::InputArgModifier()}); + } + for (const auto& pair : indexed_output_pairs) { + const std::string obn = GenRepeatedBn(pair.first, pair.second); + arg_modifier_signature.mutable_obn2output_blob_modifier()->insert( + {obn, user_op::OutputArgModifier()}); + } + user_op::UserOpConfWrapper op_conf_wrapper(op_conf); + if (op_reg_val->input_arg_modify_fn) { + user_op::GetInputArgModifier 
GetInputArgModifierFn = + [&arg_modifier_signature](const std::string& in_arg_name, + int32_t in_arg_index) -> user_op::InputArgModifier* { + const std::string ibn = GenRepeatedBn(in_arg_name, in_arg_index); + auto* map = arg_modifier_signature.mutable_ibn2input_blob_modifier(); + return &map->at(ibn); + }; + JUST(op_reg_val->input_arg_modify_fn(GetInputArgModifierFn, op_conf_wrapper)); + } + if (op_reg_val->output_arg_modify_fn) { + user_op::GetOutputArgModifier GetOutputArgModifierFn = + [&arg_modifier_signature](const std::string& in_arg_name, + int32_t in_arg_index) -> user_op::OutputArgModifier* { + const std::string obn = GenRepeatedBn(in_arg_name, in_arg_index); + auto* map = arg_modifier_signature.mutable_obn2output_blob_modifier(); + return &map->at(obn); + }; + JUST(op_reg_val->output_arg_modify_fn(GetOutputArgModifierFn, op_conf_wrapper)); + } + + for (int i = 0; i < indexed_input_pairs.size(); i++) { + const auto& pair = indexed_input_pairs.at(i); + const std::string ibn = GenRepeatedBn(pair.first, pair.second); + if (arg_modifier_signature.ibn2input_blob_modifier().at(ibn).is_mutable()) { + input_tuple_indexes4mut_ibns->emplace_back(i); + } else { + input_tuple_indexes4const_ibns->emplace_back(i); + } + } + + for (int i = 0; i < indexed_output_pairs.size(); i++) { + const auto& pair = indexed_output_pairs.at(i); + const std::string obn = GenRepeatedBn(pair.first, pair.second); + if (arg_modifier_signature.obn2output_blob_modifier().at(obn).header_infered_before_compute()) { + output_tuple_indexes4mut_obns->emplace_back(i); + } else { + output_tuple_indexes4mut2_obns->emplace_back(i); + } + } + return Maybe::Ok(); +} + +} // namespace + +/* static */ Maybe StatefulOpKernel::New( + const std::shared_ptr& op_conf, const Symbol& stream, + const AttrMap& base_attrs, const std::shared_ptr& parallel_desc, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) { + auto opkernel = std::shared_ptr(new StatefulOpKernel()); + opkernel->base_attrs_ = base_attrs; + opkernel->op_conf_ = op_conf; + opkernel->user_op_conf_.reset(new user_op::UserOpConfWrapper(op_conf)); + opkernel->stream_ = stream; + opkernel->input_arg_tuple_ = input_arg_tuple; + opkernel->output_arg_tuple_ = output_arg_tuple; + opkernel->need_check_mem_case_ = true; + + const DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(op_conf->device_tag())); + const user_op::UserOpConfWrapper* user_op_conf = opkernel->user_op_conf_.get(); + opkernel->op_infer_ctx_helper_.reset( + new UserOpInferContextHelper(user_op_conf, input_arg_tuple, output_arg_tuple)); + + opkernel->init_and_cache_ctx_helper_.reset(new UserKernelInitAndCacheContextHelper( + device_type, opkernel->user_op_conf_.get(), opkernel->input_arg_tuple_, + opkernel->output_arg_tuple_)); + opkernel->compute_ctx_helper_.reset(new UserKernelComputeContextHelper( + device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); + opkernel->reg_ctx_helper_.reset( + new UserKernelRegContextHelper(device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); + const auto* op_reg_val = + user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(user_op_conf->op_type_name()); + CHECK_NOTNULL_OR_RETURN(op_reg_val); + if (op_reg_val->logical_tensor_desc_infer_fn) { + opkernel->tensor_desc_infer_fn_ = op_reg_val->logical_tensor_desc_infer_fn; + } else { + return Error::UnimplementedError(); + } + opkernel->data_type_infer_fn_ = op_reg_val->data_type_infer_fn; + + JUST(InitTensorTupleIndexes4Bns( + op_conf, 
input_arg_tuple->indexed_arg_name_and_index(), + output_arg_tuple->indexed_arg_name_and_index(), &opkernel->input_tuple_indexes4const_ibns_, + &opkernel->input_tuple_indexes4mut_ibns_, &opkernel->output_tuple_indexes4mut_obns_, + &opkernel->output_tuple_indexes4mut2_obns_)); + + return opkernel; +} + +StatefulOpKernel::~StatefulOpKernel() = default; + +size_t StatefulOpKernel::InferTmpSize(eager::CallContext* call_ctx, + const user_op::OpKernel* user_opkernel) const { + UserOpInferContext op_infer_ctx(op_infer_ctx_helper_.get(), call_ctx); + const auto& InferTmpSizeFn = GetInferTmpSizeFn(user_opkernel); + return InferTmpSizeFn(&op_infer_ctx); +} + +Maybe StatefulOpKernel::ChooseOpKernel(eager::CallContext* call_ctx, + const user_op::OpKernel** user_opkernel, + bool* need_temp_storage) { + OF_PROFILER_RANGE_GUARD("ChooseOpKernel"); + DataType primary_dtype = kInvalidDataType; + const auto& inputs = call_ctx->inputs(); + const auto& outputs = call_ctx->outputs(); + if (likely(!inputs->empty())) { + primary_dtype = (*inputs)[0]->data_type(); + } else if (likely(!outputs->empty())) { + primary_dtype = (*outputs)[0]->data_type(); + } else { + // do nothing + } + + UserKernelRegContext reg_ctx(reg_ctx_helper_.get(), call_ctx); + for (const auto& pair : dtype2cached_kernels_[primary_dtype]) { + if (likely(pair.first->is_matched_hob->get(reg_ctx))) { + *need_temp_storage = pair.first->need_temp_storage; + *user_opkernel = pair.second.get(); + return Maybe::Ok(); + } + } + + OF_PROFILER_RANGE_GUARD("fallback"); + + const auto& op_type_name = user_op_conf_->op_type_name(); + const auto* kernel_reg_val = + JUST(user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult(op_type_name, reg_ctx)); + CHECK_NOTNULL(kernel_reg_val); + auto* kernel = kernel_reg_val->create_fn(); + dtype2cached_kernels_[primary_dtype].push_back( + {kernel_reg_val, std::shared_ptr(kernel)}); + + infer_tmp_size_fn_map_.emplace(kernel, &kernel_reg_val->infer_tmp_size_fn); + *need_temp_storage = kernel_reg_val->need_temp_storage; + *user_opkernel = kernel; + return Maybe::Ok(); +} + +void StatefulOpKernel::TryInitOpKernelStateAndCache(eager::CallContext* call_ctx, + DeviceCtx* device_ctx, + const user_op::OpKernel* op_kernel, + user_op::OpKernelState** state, + user_op::OpKernelCache** cache) { + UserKernelInitAndCacheContext init_and_cache_ctx(init_and_cache_ctx_helper_.get(), call_ctx, + device_ctx); + if (state != nullptr) { + auto it = op_kernel_state_map_.find(op_kernel); + if (it != op_kernel_state_map_.end()) { + *state = it->second.get(); + } else { + auto created_state = op_kernel->CreateOpKernelState(&init_and_cache_ctx); + op_kernel_state_map_.emplace(op_kernel, created_state); + *state = created_state.get(); + } + } + + { + auto& cache_in_map = op_kernel_cache_map_[op_kernel]; + op_kernel->InitOpKernelCacheWithFlags(&init_and_cache_ctx, + user_op::OpKernelCache::kAllMayChanged, &cache_in_map); + *cache = cache_in_map.get(); + } +} + +const user_op::InferTmpSizeFn& StatefulOpKernel::GetInferTmpSizeFn( + const user_op::OpKernel* op_kernel) const { + return *infer_tmp_size_fn_map_.at(op_kernel); +} + +user_op::TensorDescInferFn StatefulOpKernel::TensorDescInferFn() const { + return tensor_desc_infer_fn_; +} + +user_op::DataTypeInferFn StatefulOpKernel::DataTypeInferFn() const { return data_type_infer_fn_; } + +void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_ctx, + const user_op::OpKernel* user_opkernel, + user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const { + 
UserKernelComputeContext compute_context(compute_ctx_helper_.get(), call_ctx, device_ctx);
+  auto* compute_ctx = &compute_context;
+  OF_PROFILER_RANGE_GUARD("Compute");
+  if (Singleton<profiler::ProfileManager>::Get()) {
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
+    const auto CalMemorySize = [compute_ctx](const one::ArgVec& args) -> int64_t {
+      const auto Func = [compute_ctx](int64_t mem_size, const auto& pair) {
+        const auto tensor = compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second);
+        return mem_size + tensor->shape_view().elem_cnt() * GetSizeOfDataType(tensor->data_type());
+      };
+      return std::accumulate(args.begin(), args.end(), static_cast<int64_t>(0), Func);
+    };
+#endif
+    auto er_guard = CHECK_JUST(profiler::EventRecorder::CreateKernelEventRecorder(
+        op_type_name(),
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
+        [compute_ctx, CalMemorySize]() -> int64_t {
+          return CalMemorySize(compute_ctx->inputs()) + CalMemorySize(compute_ctx->outputs());
+        },
+#endif
+        [compute_ctx]() -> std::vector<Shape> {
+          std::vector<Shape> shapes;
+          for (const auto& pair : compute_ctx->inputs()) {
+            shapes.emplace_back(
+                compute_ctx->TensorDesc4ArgNameAndIndex(pair.first, pair.second)->shape());
+          }
+          return shapes;
+        }));
+    user_opkernel->Compute(compute_ctx, state, cache);
+  } else {
+    user_opkernel->Compute(compute_ctx, state, cache);
+  }
+}
+
+}  // namespace one
+}  // namespace oneflow
diff --git a/oneflow/user/kernels/tf_prelu_kernel.hip.cpp b/oneflow/user/kernels/tf_prelu_kernel.hip.cpp
index 2a27ca3..572127b 100644
--- a/oneflow/user/kernels/tf_prelu_kernel.hip.cpp
+++ b/oneflow/user/kernels/tf_prelu_kernel.hip.cpp
@@ -1,254 +1,254 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#include "hip/hip_runtime.h"
-#include "oneflow/core/framework/framework.h"
-#include "oneflow/core/ndarray/ndarray_util.h"
-#include "oneflow/core/ep/rocm/cuda_stream.h"
-
-namespace oneflow {
-
-namespace {
-
-template<typename T>
-__global__ void BroadcastPReluForwardGpu(const int32_t elem_cnt, const int32_t alpha_size,
-                                         const int32_t inner_size, const T* x, const T* alpha,
-                                         T* y) {
-  T zero_val = static_cast<T>(0.0);
-  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
-    const T x_i = x[i];
-    const T alpha_i = alpha[(i / inner_size) % alpha_size];
-    y[i] = x_i > zero_val ?
x_i : x_i * alpha_i; - } -} - -template -__global__ void BroadcastPReluBackwardGpu(const int32_t elem_cnt, const int32_t alpha_size, - const int32_t inner_size, const T* x, const T* alpha, - const T* dy, T* dx, T* alpha_diff) { - T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T dy_i = dy[i]; - const T alpha_i = alpha[(i / inner_size) % alpha_size]; - T dx_i = zero_val; - T alpha_diff_i = zero_val; - if (x_i > zero_val) { - dx_i = dy_i; - alpha_diff_i = zero_val; - } else { - dx_i = dy_i * alpha_i; - alpha_diff_i = dy_i * x_i; - } - dx[i] = dx_i; - alpha_diff[i] = alpha_diff_i; - } -} - -template -__global__ void ElemwisePReluForwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, T* y) { - T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T alpha_i = alpha[i]; - y[i] = x_i > zero_val ? x_i : x_i * alpha_i; - } -} - -template -__global__ void ElemwisePReluBackwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, - const T* dy, T* dx, T* alpha_diff) { - T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T dy_i = dy[i]; - const T alpha_i = alpha[i]; - T dx_i = zero_val; - T alpha_diff_i = zero_val; - if (x_i > zero_val) { - dx_i = dy_i; - alpha_diff_i = zero_val; - } else { - dx_i = dy_i * alpha_i; - alpha_diff_i = dy_i * x_i; - } - dx[i] = dx_i; - alpha_diff[i] = alpha_diff_i; - } -} - -bool IsAlphaShapeContiguous(const ShapeView& alpha_shape, const ShapeView& x_shape) { - if (alpha_shape.elem_cnt() == 1) { return true; } - int64_t begin_idx = -1; - for (int64_t i = 0; i < alpha_shape.NumAxes(); ++i) { - if (alpha_shape.At(i) != 1) { - begin_idx = i; - break; - } - } - CHECK_NE(begin_idx, -1); - int64_t end_idx = -1; - for (int64_t i = alpha_shape.NumAxes(); i > 0; --i) { - if (alpha_shape.At(i - 1) != 1) { - end_idx = i; - break; - } - } - CHECK_NE(end_idx, -1); - if (alpha_shape.elem_cnt() == x_shape.Count(begin_idx + 1, end_idx + 1)) { - return true; - } else { - return false; - } -} - -int32_t GetOuterSize(const ShapeView& alpha_shape, const ShapeView& x_shape) { - int32_t outer_size = x_shape.At(0); - for (int32_t i = 0; i < alpha_shape.NumAxes(); ++i) { - if (alpha_shape.At(i) == 1) { - outer_size *= x_shape.At(i + 1); - } else { - break; - } - } - return outer_size; -} - -} // namespace - -template -class TfGpuPReluKernel final : public user_op::OpKernel { - public: - TfGpuPReluKernel() = default; - ~TfGpuPReluKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { - const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / outer_size / alpha_size; - BroadcastPReluForwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), y->mut_dptr()); - } else { - user_op::Tensor* broadcasted_alpha = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); - 
NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha->mut_dptr()), - XpuVarNdarray(left_extended_shape, alpha->dptr())); - ElemwisePReluForwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, x->dptr(), broadcasted_alpha->dptr(), y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_TF_CUDA_PRELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("tf_prelu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("x", 0); \ - const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ - const int64_t tmp_buffer_size = \ - IsAlphaShapeContiguous(alpha_shape, in_shape) \ - ? 0 \ - : GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -REGISTER_TF_CUDA_PRELU_KERNEL(half) -REGISTER_TF_CUDA_PRELU_KERNEL(float) -REGISTER_TF_CUDA_PRELU_KERNEL(double) - -template -class TfGpuPReluGradKernel final : public user_op::OpKernel { - public: - TfGpuPReluGradKernel() = default; - ~TfGpuPReluGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); - T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(elem_cnt * sizeof(T))); - const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); - if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { - const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / outer_size / alpha_size; - BroadcastPReluBackwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), dy->dptr(), - dx->mut_dptr(), broadcasted_alpha_diff); - } else { - T* broadcasted_alpha = reinterpret_cast(tmp_buffer->mut_dptr() - + 2 * GetCudaAlignedSize(elem_cnt * sizeof(T))); - - NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha), - XpuVarNdarray(left_extended_shape, alpha->dptr())); - - ElemwisePReluBackwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, x->dptr(), broadcasted_alpha, dy->dptr(), dx->mut_dptr(), - broadcasted_alpha_diff); - } - NdarrayUtil::ReduceSum( - ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), - XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), - XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("tf_prelu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) 
== GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("x", 0); \ - const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ - const int64_t tmp_buffer_size = \ - IsAlphaShapeContiguous(alpha_shape, in_shape) \ - ? 2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)) \ - : 3 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(half) -REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(float) -REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void BroadcastPReluForwardGpu(const int32_t elem_cnt, const int32_t alpha_size, + const int32_t inner_size, const T* x, const T* alpha, + T* y) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T alpha_i = alpha[(i / inner_size) % alpha_size]; + y[i] = x_i > zero_val ? x_i : x_i * alpha_i; + } +} + +template +__global__ void BroadcastPReluBackwardGpu(const int32_t elem_cnt, const int32_t alpha_size, + const int32_t inner_size, const T* x, const T* alpha, + const T* dy, T* dx, T* alpha_diff) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T dy_i = dy[i]; + const T alpha_i = alpha[(i / inner_size) % alpha_size]; + T dx_i = zero_val; + T alpha_diff_i = zero_val; + if (x_i > zero_val) { + dx_i = dy_i; + alpha_diff_i = zero_val; + } else { + dx_i = dy_i * alpha_i; + alpha_diff_i = dy_i * x_i; + } + dx[i] = dx_i; + alpha_diff[i] = alpha_diff_i; + } +} + +template +__global__ void ElemwisePReluForwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, T* y) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T alpha_i = alpha[i]; + y[i] = x_i > zero_val ? 
x_i : x_i * alpha_i; + } +} + +template +__global__ void ElemwisePReluBackwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, + const T* dy, T* dx, T* alpha_diff) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T dy_i = dy[i]; + const T alpha_i = alpha[i]; + T dx_i = zero_val; + T alpha_diff_i = zero_val; + if (x_i > zero_val) { + dx_i = dy_i; + alpha_diff_i = zero_val; + } else { + dx_i = dy_i * alpha_i; + alpha_diff_i = dy_i * x_i; + } + dx[i] = dx_i; + alpha_diff[i] = alpha_diff_i; + } +} + +bool IsAlphaShapeContiguous(const ShapeView& alpha_shape, const ShapeView& x_shape) { + if (alpha_shape.elem_cnt() == 1) { return true; } + int64_t begin_idx = -1; + for (int64_t i = 0; i < alpha_shape.NumAxes(); ++i) { + if (alpha_shape.At(i) != 1) { + begin_idx = i; + break; + } + } + CHECK_NE(begin_idx, -1); + int64_t end_idx = -1; + for (int64_t i = alpha_shape.NumAxes(); i > 0; --i) { + if (alpha_shape.At(i - 1) != 1) { + end_idx = i; + break; + } + } + CHECK_NE(end_idx, -1); + if (alpha_shape.elem_cnt() == x_shape.Count(begin_idx + 1, end_idx + 1)) { + return true; + } else { + return false; + } +} + +int32_t GetOuterSize(const ShapeView& alpha_shape, const ShapeView& x_shape) { + int32_t outer_size = x_shape.At(0); + for (int32_t i = 0; i < alpha_shape.NumAxes(); ++i) { + if (alpha_shape.At(i) == 1) { + outer_size *= x_shape.At(i + 1); + } else { + break; + } + } + return outer_size; +} + +} // namespace + +template +class TfGpuPReluKernel final : public user_op::OpKernel { + public: + TfGpuPReluKernel() = default; + ~TfGpuPReluKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { + const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / outer_size / alpha_size; + BroadcastPReluForwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), y->mut_dptr()); + } else { + user_op::Tensor* broadcasted_alpha = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const Shape& left_extended_shape = + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); + NdarrayUtil::BroadcastTo( + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha->mut_dptr()), + XpuVarNdarray(left_extended_shape, alpha->dptr())); + ElemwisePReluForwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, x->dptr(), broadcasted_alpha->dptr(), y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_TF_CUDA_PRELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("tf_prelu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("x", 0); \ + const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ + const int64_t tmp_buffer_size = \ + IsAlphaShapeContiguous(alpha_shape, in_shape) \ + ? 
0 \ + : GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +REGISTER_TF_CUDA_PRELU_KERNEL(half) +REGISTER_TF_CUDA_PRELU_KERNEL(float) +REGISTER_TF_CUDA_PRELU_KERNEL(double) + +template +class TfGpuPReluGradKernel final : public user_op::OpKernel { + public: + TfGpuPReluGradKernel() = default; + ~TfGpuPReluGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); + T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(elem_cnt * sizeof(T))); + const Shape& left_extended_shape = + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); + if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { + const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / outer_size / alpha_size; + BroadcastPReluBackwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), dy->dptr(), + dx->mut_dptr(), broadcasted_alpha_diff); + } else { + T* broadcasted_alpha = reinterpret_cast(tmp_buffer->mut_dptr() + + 2 * GetCudaAlignedSize(elem_cnt * sizeof(T))); + + NdarrayUtil::BroadcastTo( + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha), + XpuVarNdarray(left_extended_shape, alpha->dptr())); + + ElemwisePReluBackwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, x->dptr(), broadcasted_alpha, dy->dptr(), dx->mut_dptr(), + broadcasted_alpha_diff); + } + NdarrayUtil::ReduceSum( + ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), + XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), + XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("tf_prelu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("x", 0); \ + const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ + const int64_t tmp_buffer_size = \ + IsAlphaShapeContiguous(alpha_shape, in_shape) \ + ? 
2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)) \ + : 3 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(half) +REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(float) +REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/to_contiguous_kernel.hip.cpp b/oneflow/user/kernels/to_contiguous_kernel.hip.cpp index 72d23a8..ac2791c 100644 --- a/oneflow/user/kernels/to_contiguous_kernel.hip.cpp +++ b/oneflow/user/kernels/to_contiguous_kernel.hip.cpp @@ -1,161 +1,161 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/common/device_type.pb.h" -#include "oneflow/user/kernels/to_contiguous_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" - -namespace oneflow { - -namespace { - -constexpr int32_t kThreadWorkSize = 4; -constexpr int32_t kNumThreads = 32 * 4; -constexpr int32_t get_min_threads_num() { return kNumThreads; } -constexpr int32_t get_block_work_size() { return kThreadWorkSize * kNumThreads; } -constexpr int32_t get_num_blocks(int64_t elem_cnt) { - return (elem_cnt + get_block_work_size() - 1) / get_block_work_size(); -} - -struct StrideParam { - int32_t stride[SHAPE_MAX_AXIS_SIZE]; - - StrideParam(const int64_t* stride_vec, const size_t ndim) { - for (size_t i = 0; i < ndim; ++i) { stride[i] = stride_vec[i]; } - } -}; - -template -__device__ __forceinline__ IndexType compute_index(IndexType out_offset, - const StrideParam& out_params, - const StrideParam& in_params) { - IndexType in_offset = 0; - IndexType remaining = out_offset; - -#pragma unroll - for (size_t i = 0; i < ndim; ++i) { - const IndexType idx = static_cast(remaining / out_params.stride[i]); - remaining -= idx * out_params.stride[i]; - in_offset += idx * in_params.stride[i]; - } - return in_offset; -} - -template -__global__ void ToContiguousForwardGpuParallel(IndexType count, const StrideParam in_stride, - const StrideParam out_stride, const T* in_dptr, - T* out_dptr, const int32_t num_block_threads, - const int32_t thread_work_size, - const int32_t block_work_size) { - IndexType remaining = count - block_work_size * blockIdx.x; - IndexType idx = blockIdx.x; - IndexType thread_idx = threadIdx.x; -#pragma unroll - for (int32_t i = 0; i < thread_work_size; i++) { - if (thread_idx >= remaining) { return; } - IndexType out_idx = thread_idx + block_work_size * idx; - IndexType in_idx = compute_index(out_idx, out_stride, in_stride); - out_dptr[out_idx] = in_dptr[in_idx]; - thread_idx += num_block_threads; - } -} - -template -void LaunchToContiguousKernel(ep::Stream* stream, IndexType count, const size_t ndim, - IndexType block_size, const std::vector& in_stride, - const DimVector& out_stride, const char* in_dptr, char* out_dptr) { - const int32_t num_blocks = get_num_blocks(count); - constexpr int32_t num_threads = 
get_min_threads_num(); - constexpr int32_t block_work_size = get_block_work_size(); - StrideParam param_in_stride(in_stride.data(), ndim), param_out_stride(out_stride.data(), ndim); - - switch (ndim) { -#define TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(dim) \ - case dim: \ - ToContiguousForwardGpuParallel \ - <<As()->cuda_stream()>>>( \ - count, param_in_stride, param_out_stride, reinterpret_cast(in_dptr), \ - reinterpret_cast(out_dptr), num_threads, kThreadWorkSize, block_work_size); \ - break; - - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(1) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(2) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(3) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(4) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(5) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(6) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(7) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(8) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(9) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(10) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(11) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(12) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(13) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(14) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(15) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(16) - default: break; -#undef TO_CONTIGUOUS_FORWARD_GPU_PARALLEL - } -} - -} // namespace - -template -struct ToContiguousUtil : ToContiguousUtilBase { - using ToContiguousUtilBase::ToContiguousUtilBase; - static constexpr size_t dsize = sizeof(T); - void operator()() { - int constant_memory_size = 0; - const size_t ndims = contiguous_dim + 1; - if (ndims == 0) { - // 0-dim tensor - OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, block_size * dsize, hipMemcpyDeviceToDevice, - stream->As()->cuda_stream())); - } else { - bool is_same = true; - for (int64_t i = contiguous_dim; i != -1; --i) { - if (out_stride[i] != in_stride[i]) { - is_same = false; - break; - } - } - if (is_same) { - // if input tensor's strides equals to output's, than just copy one memory-contiguous tensor - OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, element_count * dsize, - hipMemcpyDeviceToDevice, - stream->As()->cuda_stream())); - } else { - if (element_count < GetMaxVal()) { - LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, - out_stride, in_dptr, out_dptr); - } else { - LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, - out_stride, in_dptr, out_dptr); - } - } - } - } -}; - -#define INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA(T) \ - template struct ToContiguousUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA, - TO_CONTIGUOUS_TYPES TO_CONTIGUOUS_CUDA_SPECIAL_TYPE) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/common/device_type.pb.h" +#include "oneflow/user/kernels/to_contiguous_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" + +namespace oneflow { + +namespace { + +constexpr int32_t kThreadWorkSize = 4; +constexpr int32_t kNumThreads = 32 * 4; +constexpr int32_t get_min_threads_num() { return kNumThreads; } +constexpr int32_t get_block_work_size() { return kThreadWorkSize * kNumThreads; } +constexpr int32_t get_num_blocks(int64_t elem_cnt) { + return (elem_cnt + get_block_work_size() - 1) / get_block_work_size(); +} + +struct StrideParam { + int32_t stride[SHAPE_MAX_AXIS_SIZE]; + + StrideParam(const int64_t* stride_vec, const size_t ndim) { + for (size_t i = 0; i < ndim; ++i) { stride[i] = stride_vec[i]; } + } +}; + +template +__device__ __forceinline__ IndexType compute_index(IndexType out_offset, + const StrideParam& out_params, + const StrideParam& in_params) { + IndexType in_offset = 0; + IndexType remaining = out_offset; + +#pragma unroll + for (size_t i = 0; i < ndim; ++i) { + const IndexType idx = static_cast(remaining / out_params.stride[i]); + remaining -= idx * out_params.stride[i]; + in_offset += idx * in_params.stride[i]; + } + return in_offset; +} + +template +__global__ void ToContiguousForwardGpuParallel(IndexType count, const StrideParam in_stride, + const StrideParam out_stride, const T* in_dptr, + T* out_dptr, const int32_t num_block_threads, + const int32_t thread_work_size, + const int32_t block_work_size) { + IndexType remaining = count - block_work_size * blockIdx.x; + IndexType idx = blockIdx.x; + IndexType thread_idx = threadIdx.x; +#pragma unroll + for (int32_t i = 0; i < thread_work_size; i++) { + if (thread_idx >= remaining) { return; } + IndexType out_idx = thread_idx + block_work_size * idx; + IndexType in_idx = compute_index(out_idx, out_stride, in_stride); + out_dptr[out_idx] = in_dptr[in_idx]; + thread_idx += num_block_threads; + } +} + +template +void LaunchToContiguousKernel(ep::Stream* stream, IndexType count, const size_t ndim, + IndexType block_size, const std::vector& in_stride, + const DimVector& out_stride, const char* in_dptr, char* out_dptr) { + const int32_t num_blocks = get_num_blocks(count); + constexpr int32_t num_threads = get_min_threads_num(); + constexpr int32_t block_work_size = get_block_work_size(); + StrideParam param_in_stride(in_stride.data(), ndim), param_out_stride(out_stride.data(), ndim); + + switch (ndim) { +#define TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(dim) \ + case dim: \ + ToContiguousForwardGpuParallel \ + <<As()->cuda_stream()>>>( \ + count, param_in_stride, param_out_stride, reinterpret_cast(in_dptr), \ + reinterpret_cast(out_dptr), num_threads, kThreadWorkSize, block_work_size); \ + break; + + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(1) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(2) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(3) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(4) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(5) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(6) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(7) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(8) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(9) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(10) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(11) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(12) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(13) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(14) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(15) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(16) + default: break; +#undef TO_CONTIGUOUS_FORWARD_GPU_PARALLEL + } +} + +} // 
namespace + +template +struct ToContiguousUtil : ToContiguousUtilBase { + using ToContiguousUtilBase::ToContiguousUtilBase; + static constexpr size_t dsize = sizeof(T); + void operator()() { + int constant_memory_size = 0; + const size_t ndims = contiguous_dim + 1; + if (ndims == 0) { + // 0-dim tensor + OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, block_size * dsize, hipMemcpyDeviceToDevice, + stream->As()->cuda_stream())); + } else { + bool is_same = true; + for (int64_t i = contiguous_dim; i != -1; --i) { + if (out_stride[i] != in_stride[i]) { + is_same = false; + break; + } + } + if (is_same) { + // if input tensor's strides equals to output's, than just copy one memory-contiguous tensor + OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, element_count * dsize, + hipMemcpyDeviceToDevice, + stream->As()->cuda_stream())); + } else { + if (element_count < GetMaxVal()) { + LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, + out_stride, in_dptr, out_dptr); + } else { + LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, + out_stride, in_dptr, out_dptr); + } + } + } + } +}; + +#define INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA(T) \ + template struct ToContiguousUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA, + TO_CONTIGUOUS_TYPES TO_CONTIGUOUS_CUDA_SPECIAL_TYPE) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/tril_kernel.hip.cpp b/oneflow/user/kernels/tril_kernel.hip.cpp index 817e8b0..02e7b3d 100644 --- a/oneflow/user/kernels/tril_kernel.hip.cpp +++ b/oneflow/user/kernels/tril_kernel.hip.cpp @@ -1,256 +1,256 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void TrilGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, - const int64_t diagonal, const T* x, const T fill, T* y) { - const int64_t matrix_size = num_rows * num_cols; - CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { - const int64_t offset_in_matrix = k % matrix_size; - const int64_t i = offset_in_matrix / num_cols; - const int64_t j = offset_in_matrix - num_cols * i; - y[k] = j > i + diagonal ? 
fill : x[k]; - } -} - -template -__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, const T* x, - const T fill, T* y) { - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { - const int64_t idx = i * num_cols + col; - y[idx] = col > row + diagonal ? fill : x[idx]; - } - } -} - -template<> -__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, - const half* x, const half fill, half* y) { - const int64_t h2_num_cols = num_cols / 2; - const auto* x_h2 = reinterpret_cast(x); - auto* y_h2 = reinterpret_cast(y); - - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { - const int64_t idx = i * h2_num_cols + col; - const half2 x_val = x_h2[idx]; - half2 y_val; - y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(x_val.data.x); - y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(x_val.data.y); - y_h2[idx] = y_val; - } - } -} - -template -__global__ void FusedScaleTrilGpu(const int64_t elem_cnt, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, const T scale, - const T* x, const T fill, T* y) { - const int64_t matrix_size = num_rows * num_cols; - CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { - const int64_t offset_in_matrix = k % matrix_size; - const int64_t i = offset_in_matrix / num_cols; - const int64_t j = offset_in_matrix - num_cols * i; - y[k] = j > i + diagonal ? fill : (scale * x[k]); - } -} - -template -__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, - const T scale, const T* x, const T fill, T* y) { - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { - const int64_t idx = i * num_cols + col; - y[idx] = col > row + diagonal ? 
fill : (scale * x[idx]); - } - } -} - -template<> -__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, - const int64_t num_rows, - const int64_t num_cols, - const int64_t diagonal, const half scale, - const half* x, const half fill, half* y) { - const int64_t h2_num_cols = num_cols / 2; - const auto* x_h2 = reinterpret_cast(x); - auto* y_h2 = reinterpret_cast(y); - const half2 h2_scale = __half2half2(scale); - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { - const int64_t idx = i * h2_num_cols + col; - const half2 scaled_x = __hmul2(h2_scale, x_h2[idx]); - half2 y_val; - y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(scaled_x.data.x); - y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(scaled_x.data.y); - y_h2[idx] = y_val; - } - } -} - -template -T GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { - return is_floating_val ? static_cast(floating_value) : static_cast(integer_value); -} - -template<> -half GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { - return is_floating_val ? __float2half(floating_value) : __float2half(integer_value); -} - -} // namespace - -template -class GpuTrilKernel final : public user_op::OpKernel { - public: - GpuTrilKernel() = default; - ~GpuTrilKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape_view(); - const auto diagonal = ctx->Attr("diagonal"); - const int64_t num_rows = shape.At(shape.NumAxes() - 2); - const int64_t num_cols = shape.At(shape.NumAxes() - 1); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t elem_cnt = shape.elem_cnt(); - const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), - ctx->Attr("floating_fill_value"), - ctx->Attr("integer_fill_value")); - if (num_cols % (kCudaWarpSize * 2) == 0) { - const int64_t total_rows = elem_cnt / num_cols; - TrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( - total_rows, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); - } else { - TrilGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_TRIL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("tril") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_TRIL_KERNEL(float) -REGISTER_CUDA_TRIL_KERNEL(double) -REGISTER_CUDA_TRIL_KERNEL(bool) -REGISTER_CUDA_TRIL_KERNEL(uint8_t) -REGISTER_CUDA_TRIL_KERNEL(int8_t) -REGISTER_CUDA_TRIL_KERNEL(int32_t) -REGISTER_CUDA_TRIL_KERNEL(int64_t) -REGISTER_CUDA_TRIL_KERNEL(half) - -template -class GpuFusedScaleTrilKernel final : public user_op::OpKernel { - 
public: - GpuFusedScaleTrilKernel() = default; - ~GpuFusedScaleTrilKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape_view(); - const auto diagonal = ctx->Attr("diagonal"); - const int32_t num_rows = shape.At(shape.NumAxes() - 2); - const int32_t num_cols = shape.At(shape.NumAxes() - 1); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t elem_cnt = shape.elem_cnt(); - const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), - ctx->Attr("floating_fill_value"), - ctx->Attr("integer_fill_value")); - const T scale = GetAttrVal(ctx->Attr("is_floating_scale_value"), - ctx->Attr("floating_scale_value"), - ctx->Attr("integer_scale_value")); - if (num_cols % (kCudaWarpSize * 2) == 0) { - const int64_t total_rows = elem_cnt / num_cols; - FusedScaleTrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( - total_rows, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); - } else { - FusedScaleTrilGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_scale_tril") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(float) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(double) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(bool) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(uint8_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int8_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int32_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int64_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(half) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void TrilGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, + const int64_t diagonal, const T* x, const T fill, T* y) { + const int64_t matrix_size = num_rows * num_cols; + CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { + const int64_t offset_in_matrix = k % matrix_size; + const int64_t i = offset_in_matrix / num_cols; + const int64_t j = offset_in_matrix - num_cols * i; + y[k] = j > i + diagonal ? 
fill : x[k]; + } +} + +template +__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, const T* x, + const T fill, T* y) { + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { + const int64_t idx = i * num_cols + col; + y[idx] = col > row + diagonal ? fill : x[idx]; + } + } +} + +template<> +__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, + const half* x, const half fill, half* y) { + const int64_t h2_num_cols = num_cols / 2; + const auto* x_h2 = reinterpret_cast(x); + auto* y_h2 = reinterpret_cast(y); + + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { + const int64_t idx = i * h2_num_cols + col; + const half2 x_val = x_h2[idx]; + half2 y_val; + y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(x_val.data.x); + y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(x_val.data.y); + y_h2[idx] = y_val; + } + } +} + +template +__global__ void FusedScaleTrilGpu(const int64_t elem_cnt, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, const T scale, + const T* x, const T fill, T* y) { + const int64_t matrix_size = num_rows * num_cols; + CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { + const int64_t offset_in_matrix = k % matrix_size; + const int64_t i = offset_in_matrix / num_cols; + const int64_t j = offset_in_matrix - num_cols * i; + y[k] = j > i + diagonal ? fill : (scale * x[k]); + } +} + +template +__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, + const T scale, const T* x, const T fill, T* y) { + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { + const int64_t idx = i * num_cols + col; + y[idx] = col > row + diagonal ? 
fill : (scale * x[idx]); + } + } +} + +template<> +__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, + const int64_t num_rows, + const int64_t num_cols, + const int64_t diagonal, const half scale, + const half* x, const half fill, half* y) { + const int64_t h2_num_cols = num_cols / 2; + const auto* x_h2 = reinterpret_cast(x); + auto* y_h2 = reinterpret_cast(y); + const half2 h2_scale = __half2half2(scale); + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { + const int64_t idx = i * h2_num_cols + col; + const half2 scaled_x = __hmul2(h2_scale, x_h2[idx]); + half2 y_val; + y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(scaled_x.data.x); + y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(scaled_x.data.y); + y_h2[idx] = y_val; + } + } +} + +template +T GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { + return is_floating_val ? static_cast(floating_value) : static_cast(integer_value); +} + +template<> +half GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { + return is_floating_val ? __float2half(floating_value) : __float2half(integer_value); +} + +} // namespace + +template +class GpuTrilKernel final : public user_op::OpKernel { + public: + GpuTrilKernel() = default; + ~GpuTrilKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); + const auto shape = x->shape_view(); + const auto diagonal = ctx->Attr("diagonal"); + const int64_t num_rows = shape.At(shape.NumAxes() - 2); + const int64_t num_cols = shape.At(shape.NumAxes() - 1); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t elem_cnt = shape.elem_cnt(); + const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), + ctx->Attr("floating_fill_value"), + ctx->Attr("integer_fill_value")); + if (num_cols % (kCudaWarpSize * 2) == 0) { + const int64_t total_rows = elem_cnt / num_cols; + TrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( + total_rows, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); + } else { + TrilGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_TRIL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("tril") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ + return Maybe::Ok(); \ + }); + +REGISTER_CUDA_TRIL_KERNEL(float) +REGISTER_CUDA_TRIL_KERNEL(double) +REGISTER_CUDA_TRIL_KERNEL(bool) +REGISTER_CUDA_TRIL_KERNEL(uint8_t) +REGISTER_CUDA_TRIL_KERNEL(int8_t) +REGISTER_CUDA_TRIL_KERNEL(int32_t) +REGISTER_CUDA_TRIL_KERNEL(int64_t) +REGISTER_CUDA_TRIL_KERNEL(half) + +template +class GpuFusedScaleTrilKernel final : public user_op::OpKernel { + 
public: + GpuFusedScaleTrilKernel() = default; + ~GpuFusedScaleTrilKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); + const auto shape = x->shape_view(); + const auto diagonal = ctx->Attr("diagonal"); + const int32_t num_rows = shape.At(shape.NumAxes() - 2); + const int32_t num_cols = shape.At(shape.NumAxes() - 1); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t elem_cnt = shape.elem_cnt(); + const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), + ctx->Attr("floating_fill_value"), + ctx->Attr("integer_fill_value")); + const T scale = GetAttrVal(ctx->Attr("is_floating_scale_value"), + ctx->Attr("floating_scale_value"), + ctx->Attr("integer_scale_value")); + if (num_cols % (kCudaWarpSize * 2) == 0) { + const int64_t total_rows = elem_cnt / num_cols; + FusedScaleTrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( + total_rows, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); + } else { + FusedScaleTrilGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_scale_tril") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ + return Maybe::Ok(); \ + }); + +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(float) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(double) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(bool) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(uint8_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int8_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int32_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int64_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(half) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/triu_kernel.hip.cpp b/oneflow/user/kernels/triu_kernel.hip.cpp index 6ffb20c..23e511c 100644 --- a/oneflow/user/kernels/triu_kernel.hip.cpp +++ b/oneflow/user/kernels/triu_kernel.hip.cpp @@ -1,131 +1,131 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void TriuGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, - const int64_t diagonal, const T* x, T* y) { - const int64_t matrix_size = num_rows * num_cols; - CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { - const int64_t offset_in_matrix = k % matrix_size; - const int64_t i = offset_in_matrix / num_cols; - const int64_t j = offset_in_matrix - num_cols * i; - y[k] = j < i + diagonal ? static_cast(0) : x[k]; - } -} - -template -__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, const T* x, - T* y) { - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { - const int64_t idx = i * num_cols + col; - y[idx] = col < row + diagonal ? static_cast(0) : x[idx]; - } - } -} - -template<> -__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, - const half* x, half* y) { - const int64_t h2_num_cols = num_cols / 2; - const auto* x_h2 = reinterpret_cast(x); - auto* y_h2 = reinterpret_cast(y); - - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { - const int64_t idx = i * h2_num_cols + col; - const half2 x_val = x_h2[idx]; - half2 y_val; - y_val.data.x = (2 * col) < row + diagonal ? static_cast(0) : static_cast(x_val.data.x); - y_val.data.y = (2 * col + 1) < row + diagonal ? 
static_cast(0) : static_cast(x_val.data.y); - y_h2[idx] = y_val; - } - } -} - -} // namespace - -template -class GpuTriuKernel final : public user_op::OpKernel { - public: - GpuTriuKernel() = default; - ~GpuTriuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape_view(); - const auto diagonal = ctx->Attr("diagonal"); - const int64_t num_rows = shape.At(shape.NumAxes() - 2); - const int64_t num_cols = shape.At(shape.NumAxes() - 1); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t elem_cnt = shape.elem_cnt(); - if (elem_cnt == 0) { return; } - if (num_cols % (kCudaWarpSize * 2) == 0) { - const int64_t total_rows = elem_cnt / num_cols; - TriuWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( - total_rows, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); - } else { - TriuGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_TRIU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("triu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_TRIU_KERNEL(half) -REGISTER_CUDA_TRIU_KERNEL(float) -REGISTER_CUDA_TRIU_KERNEL(double) -REGISTER_CUDA_TRIU_KERNEL(uint8_t) -REGISTER_CUDA_TRIU_KERNEL(int8_t) -REGISTER_CUDA_TRIU_KERNEL(int32_t) -REGISTER_CUDA_TRIU_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void TriuGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, + const int64_t diagonal, const T* x, T* y) { + const int64_t matrix_size = num_rows * num_cols; + CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { + const int64_t offset_in_matrix = k % matrix_size; + const int64_t i = offset_in_matrix / num_cols; + const int64_t j = offset_in_matrix - num_cols * i; + y[k] = j < i + diagonal ? 
static_cast(0) : x[k]; + } +} + +template +__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, const T* x, + T* y) { + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { + const int64_t idx = i * num_cols + col; + y[idx] = col < row + diagonal ? static_cast(0) : x[idx]; + } + } +} + +template<> +__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, + const half* x, half* y) { + const int64_t h2_num_cols = num_cols / 2; + const auto* x_h2 = reinterpret_cast(x); + auto* y_h2 = reinterpret_cast(y); + + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { + const int64_t idx = i * h2_num_cols + col; + const half2 x_val = x_h2[idx]; + half2 y_val; + y_val.data.x = (2 * col) < row + diagonal ? static_cast(0) : static_cast(x_val.data.x); + y_val.data.y = (2 * col + 1) < row + diagonal ? static_cast(0) : static_cast(x_val.data.y); + y_h2[idx] = y_val; + } + } +} + +} // namespace + +template +class GpuTriuKernel final : public user_op::OpKernel { + public: + GpuTriuKernel() = default; + ~GpuTriuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); + const auto shape = x->shape_view(); + const auto diagonal = ctx->Attr("diagonal"); + const int64_t num_rows = shape.At(shape.NumAxes() - 2); + const int64_t num_cols = shape.At(shape.NumAxes() - 1); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t elem_cnt = shape.elem_cnt(); + if (elem_cnt == 0) { return; } + if (num_cols % (kCudaWarpSize * 2) == 0) { + const int64_t total_rows = elem_cnt / num_cols; + TriuWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( + total_rows, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); + } else { + TriuGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_TRIU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("triu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ + return Maybe::Ok(); \ + }); + +REGISTER_CUDA_TRIU_KERNEL(half) +REGISTER_CUDA_TRIU_KERNEL(float) +REGISTER_CUDA_TRIU_KERNEL(double) +REGISTER_CUDA_TRIU_KERNEL(uint8_t) +REGISTER_CUDA_TRIU_KERNEL(int8_t) +REGISTER_CUDA_TRIU_KERNEL(int32_t) +REGISTER_CUDA_TRIU_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file 
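The warp-per-row kernels in the tril/triu files above all follow one pattern: each warp (a wavefront on AMD hardware) owns one matrix row, its lanes stride across that row's columns, and every element is kept or masked by comparing the column index against row + diagonal, so consecutive lanes touch consecutive columns and loads/stores stay coalesced. Because the template arguments and launch configuration of those kernels are garbled in this patch as rendered, here is a minimal self-contained HIP sketch of the same pattern; the fixed kWarpSize, the single-block launch, and the host driver are illustrative assumptions for this sketch only, not OneFlow's kCudaWarpSize or stream helpers.

// Minimal warp-per-row lower-triangular masking sketch (illustrative, not OneFlow API).
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

constexpr int kWarpSize = 64;  // AMD wavefront size; 32 on NVIDIA hardware (assumption for this sketch).

template<typename T>
__global__ void TrilWarpPerRow(int64_t total_rows, int64_t num_rows, int64_t num_cols,
                               int64_t diagonal, const T* x, T fill, T* y) {
  // One warp handles one row of one matrix in the (possibly batched) input.
  const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kWarpSize;
  const int64_t lane_id = threadIdx.x % kWarpSize;
  const int64_t num_warps = static_cast<int64_t>(blockDim.x) * gridDim.x / kWarpSize;
  for (int64_t i = warp_id; i < total_rows; i += num_warps) {
    const int64_t row = i % num_rows;  // row index inside the current matrix
    for (int64_t col = lane_id; col < num_cols; col += kWarpSize) {
      const int64_t idx = i * num_cols + col;
      // Keep elements on or below the shifted diagonal, write the fill value elsewhere.
      y[idx] = col > row + diagonal ? fill : x[idx];
    }
  }
}

int main() {
  const int64_t rows = 4, cols = 8;
  std::vector<float> h_x(rows * cols, 1.0f), h_y(rows * cols, 0.0f);
  float* d_x = nullptr;
  float* d_y = nullptr;
  hipMalloc(&d_x, h_x.size() * sizeof(float));
  hipMalloc(&d_y, h_y.size() * sizeof(float));
  hipMemcpy(d_x, h_x.data(), h_x.size() * sizeof(float), hipMemcpyHostToDevice);
  // 256 threads per block -> 4 wavefronts per block with kWarpSize == 64.
  hipLaunchKernelGGL(TrilWarpPerRow<float>, dim3(1), dim3(256), 0, 0,
                     rows, rows, cols, int64_t{0} /* diagonal */, d_x, 0.0f, d_y);
  hipMemcpy(h_y.data(), d_y, h_y.size() * sizeof(float), hipMemcpyDeviceToHost);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) { printf("%.0f ", h_y[r * cols + c]); }
    printf("\n");
  }
  hipFree(d_x);
  hipFree(d_y);
  return 0;
}

Compiled with hipcc, this prints a 4x8 lower-triangular mask of ones; flipping the comparison to col < row + diagonal (with a zero fill) gives the triu variant shown above, and multiplying x[idx] by a scale before the select gives the fused_scale_tril variant.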
diff --git a/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp b/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp index a12653b..bf27697 100644 --- a/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp +++ b/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp @@ -1,67 +1,67 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/two_stage_reduce_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void DivideGpu(const int64_t n, const T* x, const K* count, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] / count[i]; } -} - -template -__global__ void MaskGpu(const int64_t n, const T* x, const K* mask, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = static_cast(mask[i]) * x[i]; } -} - -template -__global__ void ScaleGpu(const int64_t n, const T* x, const K* scale, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] * static_cast(scale[i]); } -} - -} // namespace - -template -struct TwoStageReduceKernelUtil { - static void Divide(ep::Stream* stream, const int64_t n, const T* x, const K* count, T* y) { - DivideGpu<<As()->cuda_stream()>>>(n, x, count, y); - } - - static void Mask(ep::Stream* stream, const int64_t n, const T* x, const K* mask, T* y) { - MaskGpu<<As()->cuda_stream()>>>(n, x, mask, y); - } - - static void Scale(ep::Stream* stream, const int64_t n, const T* x, const K* scale, T* y) { - ScaleGpu<<As()->cuda_stream()>>>(n, x, scale, y); - } -}; - -#define INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ - template struct TwoStageReduceKernelUtil; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ INDEX_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ); -#undef INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/two_stage_reduce_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void DivideGpu(const int64_t n, const T* x, const K* count, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] / count[i]; } +} + +template +__global__ void MaskGpu(const int64_t n, const T* x, const K* mask, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = static_cast(mask[i]) * x[i]; } +} + +template +__global__ void ScaleGpu(const int64_t n, const T* x, const K* scale, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] * static_cast(scale[i]); } +} + +} // namespace + +template +struct TwoStageReduceKernelUtil { + static void Divide(ep::Stream* stream, const int64_t n, const T* x, const K* count, T* y) { + DivideGpu<<As()->cuda_stream()>>>(n, x, count, y); + } + + static void Mask(ep::Stream* stream, const int64_t n, const T* x, const K* mask, T* y) { + MaskGpu<<As()->cuda_stream()>>>(n, x, mask, y); + } + + static void Scale(ep::Stream* stream, const int64_t n, const T* x, const K* scale, T* y) { + ScaleGpu<<As()->cuda_stream()>>>(n, x, scale, y); + } +}; + +#define INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ + template struct TwoStageReduceKernelUtil; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ INDEX_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ); +#undef INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/unfold_kernel_util.hip.cpp b/oneflow/user/kernels/unfold_kernel_util.hip.cpp index 7831c8f..21874a5 100644 --- a/oneflow/user/kernels/unfold_kernel_util.hip.cpp +++ b/oneflow/user/kernels/unfold_kernel_util.hip.cpp @@ -1,70 +1,70 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/unfold_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace user_op { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -// NDIM range: (1, 2, 3) -// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first -template -__global__ void CudaUnfoldForward(UnfoldParams params, const T* in, T* out) { - CUDA_1D_KERNEL_LOOP_T(INDEX_T, out_offset, params.out_elem_cnt) { - using ParamType = UnfoldParams; - INDEX_T in_index[ParamType::kInputNDim] = {0}; - INDEX_T out_index[ParamType::kOutputNDim] = {0}; - params.out_index_helper.OffsetToNdIndex(out_offset, out_index); - if (!UnfoldIndexTransform(params, out_index, in_index)) { - INDEX_T in_offset = params.in_index_helper.NdIndexToOffset(in_index); - out[out_offset] = in[in_offset]; - } else { - out[out_offset] = static_cast(kUnfoldPaddingValue); - } - } -} - -} // namespace - -template -struct UnfoldKernelUtil { - using ParamType = UnfoldParams; - static void Forward(ep::Stream* stream, const UnfoldParams* params, - const T* input_ptr, T* output_ptr) { - CudaUnfoldForward - <<out_elem_cnt), kBlockSize, 0, - stream->As()->cuda_stream()>>>(*params, input_ptr, output_ptr); - } -}; -INSTANTIATE_UNFOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/unfold_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace user_op { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +// NDIM range: (1, 2, 3) +// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first +template +__global__ void CudaUnfoldForward(UnfoldParams params, const T* in, T* out) { + CUDA_1D_KERNEL_LOOP_T(INDEX_T, out_offset, params.out_elem_cnt) { + using ParamType = UnfoldParams; + INDEX_T in_index[ParamType::kInputNDim] = {0}; + INDEX_T out_index[ParamType::kOutputNDim] = {0}; + params.out_index_helper.OffsetToNdIndex(out_offset, out_index); + if (!UnfoldIndexTransform(params, out_index, in_index)) { + INDEX_T in_offset = params.in_index_helper.NdIndexToOffset(in_index); + out[out_offset] = in[in_offset]; + } else { + out[out_offset] = static_cast(kUnfoldPaddingValue); + } + } +} + +} // namespace + +template +struct UnfoldKernelUtil { + using ParamType = UnfoldParams; + static void Forward(ep::Stream* stream, const UnfoldParams* params, + const T* input_ptr, T* output_ptr) { + CudaUnfoldForward + <<out_elem_cnt), kBlockSize, 0, + stream->As()->cuda_stream()>>>(*params, input_ptr, output_ptr); + } +}; +INSTANTIATE_UNFOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) +} // namespace user_op +} // namespace oneflow #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp b/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp index cc21d30..c90cbc1 100644 --- a/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp +++ b/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp @@ -1,222 +1,222 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/unfold_tensor_kernel_utils.h" - -namespace oneflow { - -namespace { - -const int32_t NDIMS = 16; -struct STRIDES { - int32_t val[NDIMS]; -}; - -template -__global__ void UnfoldTensorCudaKernel(const T* in_ptr, const STRIDES out_stride, - const STRIDES out_shape, const int32_t out_dims, - const int32_t elements, T* out_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = Offset(gid, out_stride.val, out_shape.val, out_dims - 1); - out_ptr[gid] = in_ptr[offset]; - gid += step; - } -} - -template -__global__ void UnfoldTensorGradCudaKernel(const T* dout_ptr, const STRIDES dout_stride, - const STRIDES dout_shape, const int32_t dout_dims, - const int32_t elements, T* din_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = Offset(gid, dout_stride.val, dout_shape.val, dout_dims - 1); - cuda::atomic::Add(&din_ptr[offset], dout_ptr[gid]); - gid += step; - } -} - -template -__global__ void InitPtr(const int32_t elements, T* ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - ptr[gid] = static_cast(0); - gid += step; - } -} - -template -struct GpuUnfoldTensorFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES out_stride, - const STRIDES out_shape, const int32_t out_dims, const int32_t elements, - T* out_ptr) { - RUN_CUDA_KERNEL((UnfoldTensorCudaKernel), stream, elements, in_ptr, out_stride, out_shape, - out_dims, elements, out_ptr); - } -}; - -template -struct GpuUnfoldTensorGradFunctor final { - void operator()(ep::Stream* stream, const T* dout_ptr, const STRIDES dout_stride, - const STRIDES dout_shape, const int32_t dout_dims, const int32_t dout_elements, - const int32_t din_elements, T* din_ptr) { - RUN_CUDA_KERNEL((InitPtr), stream, din_elements, din_elements, din_ptr); - RUN_CUDA_KERNEL((UnfoldTensorGradCudaKernel), stream, dout_elements, dout_ptr, dout_stride, - dout_shape, dout_dims, dout_elements, din_ptr); - } -}; - -} // namespace - -template -class GpuUnfoldTensorKernel final : public user_op::OpKernel { - public: - GpuUnfoldTensorKernel() = default; - ~GpuUnfoldTensorKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("y", 0); - - const ShapeView& in_shape = in->shape_view(); - std::vector out_shape; - out_shape.resize(out->shape_view().NumAxes()); - for (int i = 0; i < out->shape_view().NumAxes(); ++i) { - out_shape[i] = out->shape_view().At(i); - } - const int32_t in_dims = in_shape.NumAxes(); - const int32_t out_dims = out_shape.size(); - const int32_t dimension = ctx->Attr("dimension"); - const int32_t step = ctx->Attr("step"); - - std::vector in_stride(in_dims, 1); - for (int32_t i = in_dims - 2; i >= 0; --i) { - in_stride[i] = in_shape.At(i + 1) * in_stride.at(i + 1); - } - - std::vector out_stride(in_dims + 1); - out_stride[in_dims] = in_dims == 0 ? 
1 : in_stride[dimension]; - for (int d = 0; d < in_dims; ++d) { - if (d == dimension) { - out_stride[d] = step * in_stride[d]; - } else { - out_stride[d] = in_stride[d]; - } - } - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - const int32_t out_size = out->shape_view().elem_cnt(); - - STRIDES out_stride_cuda; - for (int i = 0; i < out_dims; ++i) { out_stride_cuda.val[i] = out_stride[i]; } - STRIDES out_shape_cuda; - for (int i = 0; i < out_dims; ++i) { out_shape_cuda.val[i] = out_shape[i]; } - - GpuUnfoldTensorFunctor()(ctx->stream(), in_ptr, out_stride_cuda, out_shape_cuda, out_dims, - out_size, out_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UNFOLD_TENSOR_KERNEL(dtype) \ - REGISTER_USER_KERNEL("unfold_tensor") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)) - -REGISTER_UNFOLD_TENSOR_KERNEL(float); -REGISTER_UNFOLD_TENSOR_KERNEL(double); -REGISTER_UNFOLD_TENSOR_KERNEL(int32_t); -REGISTER_UNFOLD_TENSOR_KERNEL(int64_t); - -template -class GpuUnfoldTensorGradKernel final : public user_op::OpKernel { - public: - GpuUnfoldTensorGradKernel() = default; - ~GpuUnfoldTensorGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dout = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* din = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const ShapeView& in_shape = in->shape_view(); - const int32_t in_dims = in_shape.NumAxes(); - std::vector din_stride(in_dims, 1); - for (int32_t i = in_dims - 2; i >= 0; --i) { - din_stride[i] = in_shape.At(i + 1) * din_stride.at(i + 1); - } - - std::vector dout_shape; - dout_shape.resize(dout->shape_view().NumAxes()); - for (int i = 0; i < dout->shape_view().NumAxes(); ++i) { - dout_shape[i] = dout->shape_view().At(i); - } - - const int32_t dout_dims = dout_shape.size(); - const int32_t dimension = ctx->Attr("dimension"); - const int32_t step = ctx->Attr("step"); - - std::vector dout_stride(in_dims + 1); - dout_stride[in_dims] = in_dims == 0 ? 
1 : din_stride[dimension]; - for (int d = 0; d < in_dims; ++d) { - if (d == dimension) { - dout_stride[d] = step * din_stride[d]; - } else { - dout_stride[d] = din_stride[d]; - } - } - - STRIDES dout_stride_cuda; - for (int i = 0; i < dout_dims; ++i) { dout_stride_cuda.val[i] = dout_stride[i]; } - STRIDES dout_shape_cuda; - for (int i = 0; i < dout_dims; ++i) { dout_shape_cuda.val[i] = dout_shape[i]; } - - const T* dout_ptr = dout->dptr(); - T* din_ptr = din->mut_dptr(); - const int32_t dout_size = dout->shape_view().elem_cnt(); - const int32_t din_size = din->shape_view().elem_cnt(); - - GpuUnfoldTensorGradFunctor()(ctx->stream(), dout_ptr, dout_stride_cuda, dout_shape_cuda, - dout_dims, dout_size, din_size, din_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("unfold_tensor_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)) - -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(float); -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(double); -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int32_t); -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
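// The unfold_tensor kernels above (and their re-added copies below) address the output
// through strides derived from the input: the unfolded dimension's stride is multiplied
// by `step`, and one extra trailing dimension of length `size` reuses the original
// stride of `dimension`. A minimal host-side sketch of that shape/stride computation
// for a contiguous tensor; the names here are illustrative, not the patch's helpers:
#include <cstdint>
#include <iostream>
#include <vector>

struct UnfoldedView {
  std::vector<int64_t> shape;
  std::vector<int64_t> stride;
};

// Computes the view produced by tensor.unfold(dimension, size, step).
UnfoldedView UnfoldView(const std::vector<int64_t>& in_shape, int dimension, int64_t size,
                        int64_t step) {
  const int ndim = static_cast<int>(in_shape.size());
  std::vector<int64_t> in_stride(ndim, 1);
  for (int i = ndim - 2; i >= 0; --i) { in_stride[i] = in_shape[i + 1] * in_stride[i + 1]; }
  UnfoldedView out;
  out.shape = in_shape;
  out.shape[dimension] = (in_shape[dimension] - size) / step + 1;  // number of windows
  out.shape.push_back(size);                                       // elements per window
  out.stride = in_stride;
  out.stride[dimension] = step * in_stride[dimension];  // hop between windows
  out.stride.push_back(in_stride[dimension]);           // hop inside a window
  return out;
}

int main() {
  // A (2, 10) tensor unfolded along dim 1 with size 4, step 2 -> shape (2, 4, 4), stride (10, 2, 1).
  UnfoldedView v = UnfoldView({2, 10}, /*dimension=*/1, /*size=*/4, /*step=*/2);
  for (int64_t s : v.shape) { std::cout << s << ' '; }
  std::cout << '\n';
  for (int64_t s : v.stride) { std::cout << s << ' '; }
  std::cout << '\n';
  return 0;
}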
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/unfold_tensor_kernel_utils.h" + +namespace oneflow { + +namespace { + +const int32_t NDIMS = 16; +struct STRIDES { + int32_t val[NDIMS]; +}; + +template +__global__ void UnfoldTensorCudaKernel(const T* in_ptr, const STRIDES out_stride, + const STRIDES out_shape, const int32_t out_dims, + const int32_t elements, T* out_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = Offset(gid, out_stride.val, out_shape.val, out_dims - 1); + out_ptr[gid] = in_ptr[offset]; + gid += step; + } +} + +template +__global__ void UnfoldTensorGradCudaKernel(const T* dout_ptr, const STRIDES dout_stride, + const STRIDES dout_shape, const int32_t dout_dims, + const int32_t elements, T* din_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = Offset(gid, dout_stride.val, dout_shape.val, dout_dims - 1); + cuda::atomic::Add(&din_ptr[offset], dout_ptr[gid]); + gid += step; + } +} + +template +__global__ void InitPtr(const int32_t elements, T* ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + ptr[gid] = static_cast(0); + gid += step; + } +} + +template +struct GpuUnfoldTensorFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES out_stride, + const STRIDES out_shape, const int32_t out_dims, const int32_t elements, + T* out_ptr) { + RUN_CUDA_KERNEL((UnfoldTensorCudaKernel), stream, elements, in_ptr, out_stride, out_shape, + out_dims, elements, out_ptr); + } +}; + +template +struct GpuUnfoldTensorGradFunctor final { + void operator()(ep::Stream* stream, const T* dout_ptr, const STRIDES dout_stride, + const STRIDES dout_shape, const int32_t dout_dims, const int32_t dout_elements, + const int32_t din_elements, T* din_ptr) { + RUN_CUDA_KERNEL((InitPtr), stream, din_elements, din_elements, din_ptr); + RUN_CUDA_KERNEL((UnfoldTensorGradCudaKernel), stream, dout_elements, dout_ptr, dout_stride, + dout_shape, dout_dims, dout_elements, din_ptr); + } +}; + +} // namespace + +template +class GpuUnfoldTensorKernel final : public user_op::OpKernel { + public: + GpuUnfoldTensorKernel() = default; + ~GpuUnfoldTensorKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("y", 0); + + const ShapeView& in_shape = in->shape_view(); + std::vector out_shape; + out_shape.resize(out->shape_view().NumAxes()); + for (int i = 0; i < out->shape_view().NumAxes(); ++i) { + out_shape[i] = out->shape_view().At(i); + } + const int32_t in_dims = in_shape.NumAxes(); + const int32_t out_dims = out_shape.size(); + const int32_t dimension = ctx->Attr("dimension"); + const int32_t step = ctx->Attr("step"); + + std::vector in_stride(in_dims, 1); + for (int32_t i = in_dims - 2; i >= 0; --i) { + in_stride[i] = in_shape.At(i + 1) * in_stride.at(i + 1); + } + + std::vector out_stride(in_dims + 1); + out_stride[in_dims] = in_dims == 0 ? 
1 : in_stride[dimension]; + for (int d = 0; d < in_dims; ++d) { + if (d == dimension) { + out_stride[d] = step * in_stride[d]; + } else { + out_stride[d] = in_stride[d]; + } + } + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + const int32_t out_size = out->shape_view().elem_cnt(); + + STRIDES out_stride_cuda; + for (int i = 0; i < out_dims; ++i) { out_stride_cuda.val[i] = out_stride[i]; } + STRIDES out_shape_cuda; + for (int i = 0; i < out_dims; ++i) { out_shape_cuda.val[i] = out_shape[i]; } + + GpuUnfoldTensorFunctor()(ctx->stream(), in_ptr, out_stride_cuda, out_shape_cuda, out_dims, + out_size, out_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UNFOLD_TENSOR_KERNEL(dtype) \ + REGISTER_USER_KERNEL("unfold_tensor") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)) + +REGISTER_UNFOLD_TENSOR_KERNEL(float); +REGISTER_UNFOLD_TENSOR_KERNEL(double); +REGISTER_UNFOLD_TENSOR_KERNEL(int32_t); +REGISTER_UNFOLD_TENSOR_KERNEL(int64_t); + +template +class GpuUnfoldTensorGradKernel final : public user_op::OpKernel { + public: + GpuUnfoldTensorGradKernel() = default; + ~GpuUnfoldTensorGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dout = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* din = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const ShapeView& in_shape = in->shape_view(); + const int32_t in_dims = in_shape.NumAxes(); + std::vector din_stride(in_dims, 1); + for (int32_t i = in_dims - 2; i >= 0; --i) { + din_stride[i] = in_shape.At(i + 1) * din_stride.at(i + 1); + } + + std::vector dout_shape; + dout_shape.resize(dout->shape_view().NumAxes()); + for (int i = 0; i < dout->shape_view().NumAxes(); ++i) { + dout_shape[i] = dout->shape_view().At(i); + } + + const int32_t dout_dims = dout_shape.size(); + const int32_t dimension = ctx->Attr("dimension"); + const int32_t step = ctx->Attr("step"); + + std::vector dout_stride(in_dims + 1); + dout_stride[in_dims] = in_dims == 0 ? 
1 : din_stride[dimension]; + for (int d = 0; d < in_dims; ++d) { + if (d == dimension) { + dout_stride[d] = step * din_stride[d]; + } else { + dout_stride[d] = din_stride[d]; + } + } + + STRIDES dout_stride_cuda; + for (int i = 0; i < dout_dims; ++i) { dout_stride_cuda.val[i] = dout_stride[i]; } + STRIDES dout_shape_cuda; + for (int i = 0; i < dout_dims; ++i) { dout_shape_cuda.val[i] = dout_shape[i]; } + + const T* dout_ptr = dout->dptr(); + T* din_ptr = din->mut_dptr(); + const int32_t dout_size = dout->shape_view().elem_cnt(); + const int32_t din_size = din->shape_view().elem_cnt(); + + GpuUnfoldTensorGradFunctor()(ctx->stream(), dout_ptr, dout_stride_cuda, dout_shape_cuda, + dout_dims, dout_size, din_size, din_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("unfold_tensor_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)) + +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(float); +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(double); +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int32_t); +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/unique_kernel_util.hip.cpp b/oneflow/user/kernels/unique_kernel_util.hip.cpp index b69bba1..287850f 100644 --- a/oneflow/user/kernels/unique_kernel_util.hip.cpp +++ b/oneflow/user/kernels/unique_kernel_util.hip.cpp @@ -1,86 +1,86 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/unique_kernel_util.h" -#include "oneflow/core/hip/unique.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr cuda::unique::Flag kUniqueFlag = cuda::unique::kOutputInverseIndices; -constexpr cuda::unique::Flag kUniqueWithCountsFlag = - cuda::unique::kOutputInverseIndices | cuda::unique::kOutputCounts; - -} // namespace - -template -struct UniqueKernelUtil { - static void Unique(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, - IDX* idx_out, void* workspace, int64_t workspace_size_in_bytes); - static void UniqueWithCounts(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, - KEY* unique_out, IDX* idx_out, IDX* count, void* workspace, - int64_t workspace_size_in_bytes); - static void GetUniqueWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, - int64_t* workspace_size_in_bytes); - static void GetUniqueWithCountsWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, - int64_t* workspace_size_in_bytes); -}; - -template -void UniqueKernelUtil::Unique(ep::Stream* stream, int64_t n, - const KEY* in, IDX* num_unique, - KEY* unique_out, IDX* idx_out, - void* workspace, - int64_t workspace_size_in_bytes) { - OF_CUDA_CHECK((cuda::unique::Launch(kUniqueFlag, n, in, unique_out, num_unique, idx_out, - nullptr, workspace, workspace_size_in_bytes, - stream->As()->cuda_stream()))); -} - -template -void UniqueKernelUtil::UniqueWithCounts( - ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, IDX* idx_out, - IDX* count, void* workspace, int64_t workspace_size_in_bytes) { - OF_CUDA_CHECK((cuda::unique::Launch( - kUniqueWithCountsFlag, n, in, unique_out, num_unique, idx_out, count, workspace, - workspace_size_in_bytes, stream->As()->cuda_stream()))); -} - -template -void UniqueKernelUtil::GetUniqueWorkspaceSizeInBytes( - ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { - size_t ws = 0; - OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueFlag, n, &ws))); - *workspace_size_in_bytes = static_cast(ws); -} - -template -void UniqueKernelUtil::GetUniqueWithCountsWorkspaceSizeInBytes( - ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { - size_t ws = 0; - OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueWithCountsFlag, n, &ws))); - *workspace_size_in_bytes = static_cast(ws); -} - -#define INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA(key_type_pair, idx_type_pair) \ - template struct UniqueKernelUtil; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
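// The UniqueKernelUtil code above delegates to cuda::unique::Launch after a separate
// workspace-size query. Functionally, "unique with counts" produces the distinct keys,
// the number of distinct keys, an inverse index for every input element, and a per-key
// count. A CPU reference of those semantics for comparison/testing; this is not the
// CUDA implementation, and the order of the unique keys it produces may differ from
// the device kernel's:
#include <cstdint>
#include <unordered_map>
#include <vector>

template<typename KEY, typename IDX>
void UniqueWithCountsRef(const std::vector<KEY>& in, std::vector<KEY>* unique_out,
                         std::vector<IDX>* idx_out, std::vector<IDX>* count, IDX* num_unique) {
  std::unordered_map<KEY, IDX> key2idx;
  unique_out->clear();
  count->clear();
  idx_out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    auto it = key2idx.find(in[i]);
    if (it == key2idx.end()) {
      const IDX new_idx = static_cast<IDX>(unique_out->size());
      it = key2idx.emplace(in[i], new_idx).first;
      unique_out->push_back(in[i]);
      count->push_back(0);
    }
    (*idx_out)[i] = it->second;  // inverse index: position of in[i] among the unique keys
    (*count)[it->second] += 1;   // occurrence count of this key
  }
  *num_unique = static_cast<IDX>(unique_out->size());
}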
+*/ +#include "oneflow/user/kernels/unique_kernel_util.h" +#include "oneflow/core/hip/unique.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr cuda::unique::Flag kUniqueFlag = cuda::unique::kOutputInverseIndices; +constexpr cuda::unique::Flag kUniqueWithCountsFlag = + cuda::unique::kOutputInverseIndices | cuda::unique::kOutputCounts; + +} // namespace + +template +struct UniqueKernelUtil { + static void Unique(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, + IDX* idx_out, void* workspace, int64_t workspace_size_in_bytes); + static void UniqueWithCounts(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, + KEY* unique_out, IDX* idx_out, IDX* count, void* workspace, + int64_t workspace_size_in_bytes); + static void GetUniqueWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, + int64_t* workspace_size_in_bytes); + static void GetUniqueWithCountsWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, + int64_t* workspace_size_in_bytes); +}; + +template +void UniqueKernelUtil::Unique(ep::Stream* stream, int64_t n, + const KEY* in, IDX* num_unique, + KEY* unique_out, IDX* idx_out, + void* workspace, + int64_t workspace_size_in_bytes) { + OF_CUDA_CHECK((cuda::unique::Launch(kUniqueFlag, n, in, unique_out, num_unique, idx_out, + nullptr, workspace, workspace_size_in_bytes, + stream->As()->cuda_stream()))); +} + +template +void UniqueKernelUtil::UniqueWithCounts( + ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, IDX* idx_out, + IDX* count, void* workspace, int64_t workspace_size_in_bytes) { + OF_CUDA_CHECK((cuda::unique::Launch( + kUniqueWithCountsFlag, n, in, unique_out, num_unique, idx_out, count, workspace, + workspace_size_in_bytes, stream->As()->cuda_stream()))); +} + +template +void UniqueKernelUtil::GetUniqueWorkspaceSizeInBytes( + ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { + size_t ws = 0; + OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueFlag, n, &ws))); + *workspace_size_in_bytes = static_cast(ws); +} + +template +void UniqueKernelUtil::GetUniqueWithCountsWorkspaceSizeInBytes( + ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { + size_t ws = 0; + OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueWithCountsFlag, n, &ws))); + *workspace_size_in_bytes = static_cast(ws); +} + +#define INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA(key_type_pair, idx_type_pair) \ + template struct UniqueKernelUtil; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA + +} // namespace oneflow diff --git a/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp b/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp index 6334ee7..1b9dfa9 100644 --- a/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp +++ b/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp @@ -1,222 +1,222 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/kernel/kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__device__ __forceinline__ bool IsZero(T v) { - return v == 0; -} - -template<> -__device__ __forceinline__ bool IsZero(half v) { - return v == static_cast(0); -} - -template<> -__device__ __forceinline__ bool IsZero(half2 v) { - // return v.x == static_cast(0) && v.y == static_cast(0); - return v.data.x == 0 && v.data.y == 0; -} - -template -__global__ void UnsortedSegmentSumGpu(const IDX data_elem_cnt, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, const U* data, - const K* segment_ids, const IDX num_segments, - const IDX segment_id_offset, T* out) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { - const U val = data[i]; - if (!IsZero(val)) { - IDX outer_idx, segment_id_idx, inner_idx; - in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx, inner_idx); - const K origin_idx = segment_ids[segment_id_idx]; - assert(origin_idx >= 0); - const IDX idx = origin_idx - segment_id_offset; - if (idx >= 0 && idx < num_segments) { - const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx, inner_idx); - if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } - } - } - } -} - -template -__global__ void UnsortedSegmentColSumGpu(const IDX data_elem_cnt, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, - const U* data, const K* segment_ids, - const IDX num_segments, const IDX segment_id_offset, - T* out) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { - const U val = data[i]; - if (!IsZero(val)) { - IDX outer_idx, segment_id_idx; - in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx); - const K origin_idx = segment_ids[segment_id_idx]; - assert(origin_idx >= 0); - const IDX idx = origin_idx - segment_id_offset; - if (idx >= 0 && idx < num_segments) { - const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx); - if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } - } - } - } -} - -template -__global__ void UnsortedSegmentRowSumGpu(const IDX data_elem_cnt, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, - const U* data, const K* segment_ids, - const IDX num_segments, const IDX segment_id_offset, - T* out) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { - const U val = data[i]; - if (!IsZero(val)) { - IDX segment_id_idx, inner_idx; - in_helper.OffsetToNdIndex(i, segment_id_idx, inner_idx); - const K origin_idx = segment_ids[segment_id_idx]; - assert(origin_idx >= 0); - const IDX idx = origin_idx - segment_id_offset; - if (idx >= 0 && idx < num_segments) { - const int64_t out_offset = out_helper.NdIndexToOffset(idx, inner_idx); - if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } - } - } - } -} - -template -void UnsortedSegmentSumUtil(ep::Stream* stream, const K* segment_ids, const U* data, - IDX num_segment_ids, IDX num_segments, IDX outer_dim_size, - IDX inner_dim_size, IDX segment_id_offset, T* out) { - const IDX data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; - if (inner_dim_size == 1) { - NdIndexOffsetHelper in_helper(outer_dim_size, 
num_segment_ids); - NdIndexOffsetHelper out_helper(outer_dim_size, num_segments); - UnsortedSegmentColSumGpu - <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, - data, segment_ids, num_segments, - segment_id_offset, out); - - } else if (outer_dim_size == 1) { - NdIndexOffsetHelper in_helper(num_segment_ids, inner_dim_size); - NdIndexOffsetHelper out_helper(num_segments, inner_dim_size); - UnsortedSegmentRowSumGpu - <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, - data, segment_ids, num_segments, - segment_id_offset, out); - - } else { - NdIndexOffsetHelper in_helper(outer_dim_size, num_segment_ids, inner_dim_size); - NdIndexOffsetHelper out_helper(outer_dim_size, num_segments, inner_dim_size); - UnsortedSegmentSumGpu - <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, - data, segment_ids, num_segments, - segment_id_offset, out); - } -} - -template -void DispatchDataType(ep::Stream* stream, const K* segment_ids, const U* data, - int64_t num_segment_ids, int64_t num_segments, int64_t outer_dim_size, - int64_t inner_dim_size, int64_t segment_id_offset, T* out) { - auto* cuda_stream = stream->As(); - if (std::is_same::value && std::is_same::value - && cuda_stream->device_properties().major >= 6 - && reinterpret_cast(data) % sizeof(half2) == 0 - && reinterpret_cast(out) % sizeof(half2) == 0 && inner_dim_size % 2 == 0) { - UnsortedSegmentSumUtil( - stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, - outer_dim_size, inner_dim_size / 2, segment_id_offset, reinterpret_cast(out)); - } else { - UnsortedSegmentSumUtil(stream, segment_ids, data, num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } -} - -} // namespace - -template -struct UnsortedSegmentSumKernelUtil final { - static void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const U* data, - int64_t num_segment_ids, int64_t num_segments, - int64_t outer_dim_size, int64_t inner_dim_size, - int64_t segment_id_offset, T* out) { - const int64_t data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; - const int64_t out_elem_cnt = outer_dim_size * num_segments * inner_dim_size; - - if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { - DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } else { - DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } - } -}; - -template -struct UnsortedSegmentSumKernelUtil final { - static void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const float16* data, - int64_t num_segment_ids, int64_t num_segments, - int64_t outer_dim_size, int64_t inner_dim_size, - int64_t segment_id_offset, float* out) { - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } -}; - -#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA(in_type_pair, index_type_pair) \ - template struct UnsortedSegmentSumKernelUtil; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA, - UNSORTED_SEGMENT_SUM_DATA_TYPE_SEQ, - UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ); -#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA - -#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA(in_type_pair, index_type_pair, \ - out_type_pair) \ - template struct 
UnsortedSegmentSumKernelUtil; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA, - OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat), - UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ, FLOAT16_DATA_TYPE_SEQ); - -#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA - -template struct UnsortedSegmentSumKernelUtil; -template struct UnsortedSegmentSumKernelUtil; -template struct UnsortedSegmentSumKernelUtil; - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/kernel/kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__device__ __forceinline__ bool IsZero(T v) { + return v == 0; +} + +template<> +__device__ __forceinline__ bool IsZero(half v) { + return v == static_cast(0); +} + +template<> +__device__ __forceinline__ bool IsZero(half2 v) { + // return v.x == static_cast(0) && v.y == static_cast(0); + return v.data.x == 0 && v.data.y == 0; +} + +template +__global__ void UnsortedSegmentSumGpu(const IDX data_elem_cnt, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, const U* data, + const K* segment_ids, const IDX num_segments, + const IDX segment_id_offset, T* out) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { + const U val = data[i]; + if (!IsZero(val)) { + IDX outer_idx, segment_id_idx, inner_idx; + in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx, inner_idx); + const K origin_idx = segment_ids[segment_id_idx]; + assert(origin_idx >= 0); + const IDX idx = origin_idx - segment_id_offset; + if (idx >= 0 && idx < num_segments) { + const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx, inner_idx); + if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } + } + } + } +} + +template +__global__ void UnsortedSegmentColSumGpu(const IDX data_elem_cnt, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, + const U* data, const K* segment_ids, + const IDX num_segments, const IDX segment_id_offset, + T* out) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { + const U val = data[i]; + if (!IsZero(val)) { + IDX outer_idx, segment_id_idx; + in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx); + const K origin_idx = segment_ids[segment_id_idx]; + assert(origin_idx >= 0); + const IDX idx = origin_idx - segment_id_offset; + if (idx >= 0 && idx < num_segments) { + const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx); + if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } + } + } + } +} + +template +__global__ void UnsortedSegmentRowSumGpu(const IDX data_elem_cnt, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, + const U* data, const K* segment_ids, + const IDX num_segments, const IDX 
segment_id_offset, + T* out) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { + const U val = data[i]; + if (!IsZero(val)) { + IDX segment_id_idx, inner_idx; + in_helper.OffsetToNdIndex(i, segment_id_idx, inner_idx); + const K origin_idx = segment_ids[segment_id_idx]; + assert(origin_idx >= 0); + const IDX idx = origin_idx - segment_id_offset; + if (idx >= 0 && idx < num_segments) { + const int64_t out_offset = out_helper.NdIndexToOffset(idx, inner_idx); + if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } + } + } + } +} + +template +void UnsortedSegmentSumUtil(ep::Stream* stream, const K* segment_ids, const U* data, + IDX num_segment_ids, IDX num_segments, IDX outer_dim_size, + IDX inner_dim_size, IDX segment_id_offset, T* out) { + const IDX data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; + if (inner_dim_size == 1) { + NdIndexOffsetHelper in_helper(outer_dim_size, num_segment_ids); + NdIndexOffsetHelper out_helper(outer_dim_size, num_segments); + UnsortedSegmentColSumGpu + <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, + data, segment_ids, num_segments, + segment_id_offset, out); + + } else if (outer_dim_size == 1) { + NdIndexOffsetHelper in_helper(num_segment_ids, inner_dim_size); + NdIndexOffsetHelper out_helper(num_segments, inner_dim_size); + UnsortedSegmentRowSumGpu + <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, + data, segment_ids, num_segments, + segment_id_offset, out); + + } else { + NdIndexOffsetHelper in_helper(outer_dim_size, num_segment_ids, inner_dim_size); + NdIndexOffsetHelper out_helper(outer_dim_size, num_segments, inner_dim_size); + UnsortedSegmentSumGpu + <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, + data, segment_ids, num_segments, + segment_id_offset, out); + } +} + +template +void DispatchDataType(ep::Stream* stream, const K* segment_ids, const U* data, + int64_t num_segment_ids, int64_t num_segments, int64_t outer_dim_size, + int64_t inner_dim_size, int64_t segment_id_offset, T* out) { + auto* cuda_stream = stream->As(); + if (std::is_same::value && std::is_same::value + && cuda_stream->device_properties().major >= 6 + && reinterpret_cast(data) % sizeof(half2) == 0 + && reinterpret_cast(out) % sizeof(half2) == 0 && inner_dim_size % 2 == 0) { + UnsortedSegmentSumUtil( + stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, + outer_dim_size, inner_dim_size / 2, segment_id_offset, reinterpret_cast(out)); + } else { + UnsortedSegmentSumUtil(stream, segment_ids, data, num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } +} + +} // namespace + +template +struct UnsortedSegmentSumKernelUtil final { + static void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const U* data, + int64_t num_segment_ids, int64_t num_segments, + int64_t outer_dim_size, int64_t inner_dim_size, + int64_t segment_id_offset, T* out) { + const int64_t data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; + const int64_t out_elem_cnt = outer_dim_size * num_segments * inner_dim_size; + + if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { + DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } else { + DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } + } +}; + +template +struct UnsortedSegmentSumKernelUtil final { + static void 
UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const float16* data, + int64_t num_segment_ids, int64_t num_segments, + int64_t outer_dim_size, int64_t inner_dim_size, + int64_t segment_id_offset, float* out) { + UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( + stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } +}; + +#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA(in_type_pair, index_type_pair) \ + template struct UnsortedSegmentSumKernelUtil; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA, + UNSORTED_SEGMENT_SUM_DATA_TYPE_SEQ, + UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ); +#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA + +#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA(in_type_pair, index_type_pair, \ + out_type_pair) \ + template struct UnsortedSegmentSumKernelUtil; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA, + OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat), + UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ, FLOAT16_DATA_TYPE_SEQ); + +#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA + +template struct UnsortedSegmentSumKernelUtil; +template struct UnsortedSegmentSumKernelUtil; +template struct UnsortedSegmentSumKernelUtil; + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp b/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp index 6525f41..a9324b9 100644 --- a/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp @@ -1,234 +1,234 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
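// The UnsortedSegmentSumGpu kernels above scatter data[outer, i, inner] into
// out[outer, segment_ids[i] - segment_id_offset, inner] with atomic adds, skipping ids
// that fall outside [0, num_segments). A small CPU reference of the same reduction
// (not the patch's kernel; flat row-major indexing is assumed):
#include <cstdint>
#include <vector>

template<typename T, typename K>
void UnsortedSegmentSumRef(const std::vector<K>& segment_ids, const std::vector<T>& data,
                           int64_t num_segments, int64_t outer_dim, int64_t inner_dim,
                           int64_t segment_id_offset, std::vector<T>* out) {
  const int64_t num_ids = static_cast<int64_t>(segment_ids.size());
  out->assign(outer_dim * num_segments * inner_dim, static_cast<T>(0));
  for (int64_t o = 0; o < outer_dim; ++o) {
    for (int64_t i = 0; i < num_ids; ++i) {
      const int64_t seg = static_cast<int64_t>(segment_ids[i]) - segment_id_offset;
      if (seg < 0 || seg >= num_segments) { continue; }  // out-of-range ids are dropped
      for (int64_t j = 0; j < inner_dim; ++j) {
        (*out)[(o * num_segments + seg) * inner_dim + j] +=
            data[(o * num_ids + i) * inner_dim + j];
      }
    }
  }
}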
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__device__ void upsample_increment_value_bounded_cuda(T* data, int64_t width, int64_t height, - int64_t x, int64_t y, T value) { - int64_t access_x = max(min(x, width - 1), static_cast(0)); - int64_t access_y = max(min(y, height - 1), static_cast(0)); - cuda::atomic::Add(data + access_y * width + access_x, value); -} - -template -__global__ void UpsampleBicubic2dForward(const int64_t elem_cnt, const T* in_dptr, - const int64_t nbatch, const int64_t channels, - const int64_t in_height, const int64_t in_width, - const int64_t out_height, const int64_t out_width, - const float scale_height, const float scale_width, - bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { - const int output_x = idx % out_width; - const int output_y = idx / out_width; - - const T* in = in_dptr; - T* out = out_dptr; - - const T real_x = GetAreaPixel(scale_width, output_x, align_corners, /*cubic=*/true); - int64_t input_x = floor(1.0 * real_x); - const T t_x = real_x - input_x; - - const T real_y = GetAreaPixel(scale_height, output_y, align_corners, /*cubic=*/true); - int64_t input_y = floor(1.0 * real_y); - const T t_y = real_y - input_y; - - for (int64_t c = 0; c < channels * nbatch; c++) { - T coefficients[4]; - - // Interpolate 4 times in the x direction - for (int64_t i = 0; i < 4; i++) { - coefficients[i] = cubic_interp1d( - upsample_get_value_bounded(in, in_width, in_height, input_x - 1, input_y - 1 + i), - upsample_get_value_bounded(in, in_width, in_height, input_x + 0, input_y - 1 + i), - upsample_get_value_bounded(in, in_width, in_height, input_x + 1, input_y - 1 + i), - upsample_get_value_bounded(in, in_width, in_height, input_x + 2, input_y - 1 + i), - t_x); - } - - // Interpolate in the y direction using x interpolations - out[output_y * out_width + output_x] = cubic_interp1d( - coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y); - - // Move to next channel - in += in_width * in_height; - out += out_width * out_height; - } - } -} - -template -__global__ void UpsampleBicubic2dBackward(const int64_t elem_cnt, const T* dy_dptr, - const int64_t nbatch, const int64_t channels, - const int64_t in_height, const int64_t in_width, - const int64_t out_height, const int64_t out_width, - const float scale_height, const float scale_width, - bool align_corners, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { - const int output_x = idx % out_width; - const int output_y = idx / out_width; - - T* in = dx_dptr; - const T* out = dy_dptr; - - T real_x = GetAreaPixel(scale_width, output_x, align_corners, true); - int64_t input_x = floor(1.0 * real_x); - T t_x = real_x - input_x; - - T real_y = GetAreaPixel(scale_height, output_y, align_corners, true); - int64_t input_y = floor(1.0 * real_y); - T t_y = real_y - input_y; - - T x_coeffs[4]; - T y_coeffs[4]; - - get_cubic_upsample_coefficients(x_coeffs, t_x); - get_cubic_upsample_coefficients(y_coeffs, t_y); - - for (int64_t c = 0; c < channels * nbatch; c++) { - T out_value = out[output_y * out_width + output_x]; - - for (int64_t i = 0; i < 4; i++) { - for (int64_t j = 0; j < 4; j++) { - upsample_increment_value_bounded_cuda(in, in_width, in_height, input_x - 1 + i, - input_y - 1 + j, - out_value * 
y_coeffs[j] * x_coeffs[i]); - } - } - - in += in_width * in_height; - out += out_width * out_height; - } - } -} - -} // namespace - -template -class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel { - public: - UpsampleBicubic2dGPUKernel() = default; - ~UpsampleBicubic2dGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const T* in_ptr = x_tensor->dptr(); - T* out_ptr = y_tensor->mut_dptr(); - const bool align_corners = ctx->Attr("align_corners"); - - const int nbatch = x_tensor->shape_view().At(0); - const int channels = x_tensor->shape_view().At(1); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t in_width = x_tensor->shape_view().At(3); - const int64_t out_height = y_tensor->shape_view().At(2); - const int64_t out_width = y_tensor->shape_view().At(3); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - const int64_t elem_cnt = out_height * out_width; - - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleBicubic2dForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), nbatch, channels, in_height, in_width, out_height, - out_width, scale_height, scale_width, align_corners, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel { - public: - UpsampleBicubic2dGradGPUKernel() = default; - ~UpsampleBicubic2dGradGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - - const int nbatch = dx_tensor->shape_view().At(0); - const int channels = dx_tensor->shape_view().At(1); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t in_width = dx_tensor->shape_view().At(3); - const int64_t out_height = dy_tensor->shape_view().At(2); - const int64_t out_width = dy_tensor->shape_view().At(3); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - const int64_t elem_cnt = out_height * out_width; - - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), 
dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleBicubic2dBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), nbatch, channels, in_height, in_width, out_height, - out_width, scale_height, scale_width, align_corners, - dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_bicubic_2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_bicubic_2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(float) -REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
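// The UpsampleBicubic2d kernels in this file (above, and re-added below) delegate the
// 1-D weighting to cubic_interp1d / get_cubic_upsample_coefficients from
// upsample_kernel.h, which is not part of this hunk. A sketch under the assumption
// that it follows the common Keys cubic-convolution kernel with A = -0.75; the helper
// names here are illustrative:
#include <cstdio>

// Weight for |x| <= 1 and for 1 < |x| <= 2, respectively (Keys, A = -0.75).
static double CubicConv1(double x, double A) { return ((A + 2.0) * x - (A + 3.0)) * x * x + 1.0; }
static double CubicConv2(double x, double A) { return ((A * x - 5.0 * A) * x + 8.0 * A) * x - 4.0 * A; }

static void CubicCoefficients(double t, double coeffs[4]) {
  const double A = -0.75;
  coeffs[0] = CubicConv2(t + 1.0, A);  // sample at offset -1
  coeffs[1] = CubicConv1(t, A);        // sample at offset 0
  coeffs[2] = CubicConv1(1.0 - t, A);  // sample at offset +1
  coeffs[3] = CubicConv2(2.0 - t, A);  // sample at offset +2
}

// Interpolates between four consecutive samples x0..x3 at fractional position t in [0, 1).
static double CubicInterp1d(double x0, double x1, double x2, double x3, double t) {
  double c[4];
  CubicCoefficients(t, c);
  return x0 * c[0] + x1 * c[1] + x2 * c[2] + x3 * c[3];
}

int main() {
  // At t = 0 the interpolation reproduces the second sample exactly.
  std::printf("%f\n", CubicInterp1d(1.0, 2.0, 3.0, 4.0, 0.0));  // prints 2.000000
  return 0;
}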
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__device__ void upsample_increment_value_bounded_cuda(T* data, int64_t width, int64_t height, + int64_t x, int64_t y, T value) { + int64_t access_x = max(min(x, width - 1), static_cast(0)); + int64_t access_y = max(min(y, height - 1), static_cast(0)); + cuda::atomic::Add(data + access_y * width + access_x, value); +} + +template +__global__ void UpsampleBicubic2dForward(const int64_t elem_cnt, const T* in_dptr, + const int64_t nbatch, const int64_t channels, + const int64_t in_height, const int64_t in_width, + const int64_t out_height, const int64_t out_width, + const float scale_height, const float scale_width, + bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { + const int output_x = idx % out_width; + const int output_y = idx / out_width; + + const T* in = in_dptr; + T* out = out_dptr; + + const T real_x = GetAreaPixel(scale_width, output_x, align_corners, /*cubic=*/true); + int64_t input_x = floor(1.0 * real_x); + const T t_x = real_x - input_x; + + const T real_y = GetAreaPixel(scale_height, output_y, align_corners, /*cubic=*/true); + int64_t input_y = floor(1.0 * real_y); + const T t_y = real_y - input_y; + + for (int64_t c = 0; c < channels * nbatch; c++) { + T coefficients[4]; + + // Interpolate 4 times in the x direction + for (int64_t i = 0; i < 4; i++) { + coefficients[i] = cubic_interp1d( + upsample_get_value_bounded(in, in_width, in_height, input_x - 1, input_y - 1 + i), + upsample_get_value_bounded(in, in_width, in_height, input_x + 0, input_y - 1 + i), + upsample_get_value_bounded(in, in_width, in_height, input_x + 1, input_y - 1 + i), + upsample_get_value_bounded(in, in_width, in_height, input_x + 2, input_y - 1 + i), + t_x); + } + + // Interpolate in the y direction using x interpolations + out[output_y * out_width + output_x] = cubic_interp1d( + coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y); + + // Move to next channel + in += in_width * in_height; + out += out_width * out_height; + } + } +} + +template +__global__ void UpsampleBicubic2dBackward(const int64_t elem_cnt, const T* dy_dptr, + const int64_t nbatch, const int64_t channels, + const int64_t in_height, const int64_t in_width, + const int64_t out_height, const int64_t out_width, + const float scale_height, const float scale_width, + bool align_corners, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { + const int output_x = idx % out_width; + const int output_y = idx / out_width; + + T* in = dx_dptr; + const T* out = dy_dptr; + + T real_x = GetAreaPixel(scale_width, output_x, align_corners, true); + int64_t input_x = floor(1.0 * real_x); + T t_x = real_x - input_x; + + T real_y = GetAreaPixel(scale_height, output_y, align_corners, true); + int64_t input_y = floor(1.0 * real_y); + T t_y = real_y - input_y; + + T x_coeffs[4]; + T y_coeffs[4]; + + get_cubic_upsample_coefficients(x_coeffs, t_x); + get_cubic_upsample_coefficients(y_coeffs, t_y); + + for (int64_t c = 0; c < channels * nbatch; c++) { + T out_value = out[output_y * out_width + output_x]; + + for (int64_t i = 0; i < 4; i++) { + for (int64_t j = 0; j < 4; j++) { + upsample_increment_value_bounded_cuda(in, in_width, in_height, input_x - 1 + i, + input_y - 1 + j, + out_value * 
y_coeffs[j] * x_coeffs[i]); + } + } + + in += in_width * in_height; + out += out_width * out_height; + } + } +} + +} // namespace + +template +class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel { + public: + UpsampleBicubic2dGPUKernel() = default; + ~UpsampleBicubic2dGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const T* in_ptr = x_tensor->dptr(); + T* out_ptr = y_tensor->mut_dptr(); + const bool align_corners = ctx->Attr("align_corners"); + + const int nbatch = x_tensor->shape_view().At(0); + const int channels = x_tensor->shape_view().At(1); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + const int64_t elem_cnt = out_height * out_width; + + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + + RUN_CUDA_KERNEL((UpsampleBicubic2dForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), nbatch, channels, in_height, in_width, out_height, + out_width, scale_height, scale_width, align_corners, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel { + public: + UpsampleBicubic2dGradGPUKernel() = default; + ~UpsampleBicubic2dGradGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + + const int nbatch = dx_tensor->shape_view().At(0); + const int channels = dx_tensor->shape_view().At(1); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + const int64_t elem_cnt = out_height * out_width; + + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), 
+template<typename T>
+class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel {
+ public:
+  UpsampleBicubic2dGPUKernel() = default;
+  ~UpsampleBicubic2dGPUKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
+    user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
+    const T* in_ptr = x_tensor->dptr<T>();
+    T* out_ptr = y_tensor->mut_dptr<T>();
+    const bool align_corners = ctx->Attr<bool>("align_corners");
+
+    const int nbatch = x_tensor->shape_view().At(0);
+    const int channels = x_tensor->shape_view().At(1);
+    const int64_t in_height = x_tensor->shape_view().At(2);
+    const int64_t in_width = x_tensor->shape_view().At(3);
+    const int64_t out_height = y_tensor->shape_view().At(2);
+    const int64_t out_width = y_tensor->shape_view().At(3);
+    const std::vector<int64_t> output_size = ctx->Attr<std::vector<int64_t>>("output_size");
+    double height_scale = ctx->Attr<double>("height_scale");
+    double width_scale = ctx->Attr<double>("width_scale");
+    if (!output_size.empty()) {
+      height_scale = static_cast<double>(out_height) / static_cast<double>(in_height);
+      width_scale = static_cast<double>(out_width) / static_cast<double>(in_width);
+    }
+    const int64_t elem_cnt = out_height * out_width;
+
+    if (in_height == out_height && in_width == out_width) {
+      Memcpy<DeviceType::kCUDA>(
+          ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(),
+          x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type()));
+    } else {
+      const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
+      const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
+
+      RUN_CUDA_KERNEL((UpsampleBicubic2dForward<T>), ctx->stream(), elem_cnt, elem_cnt,
+                      x_tensor->dptr<T>(), nbatch, channels, in_height, in_width, out_height,
+                      out_width, scale_height, scale_width, align_corners,
+                      y_tensor->mut_dptr<T>());
+    }
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+template<typename T>
+class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel {
+ public:
+  UpsampleBicubic2dGradGPUKernel() = default;
+  ~UpsampleBicubic2dGradGPUKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
+    Memset<DeviceType::kCUDA>(ctx->stream(), dx_tensor->mut_dptr(), 0,
+                              dx_tensor->shape_view().elem_cnt() * sizeof(T));
+    const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const bool align_corners = ctx->Attr<bool>("align_corners");
+
+    const int nbatch = dx_tensor->shape_view().At(0);
+    const int channels = dx_tensor->shape_view().At(1);
+    const int64_t in_height = dx_tensor->shape_view().At(2);
+    const int64_t in_width = dx_tensor->shape_view().At(3);
+    const int64_t out_height = dy_tensor->shape_view().At(2);
+    const int64_t out_width = dy_tensor->shape_view().At(3);
+    const std::vector<int64_t> output_size = ctx->Attr<std::vector<int64_t>>("output_size");
+    double height_scale = ctx->Attr<double>("height_scale");
+    double width_scale = ctx->Attr<double>("width_scale");
+    if (!output_size.empty()) {
+      height_scale = static_cast<double>(out_height) / static_cast<double>(in_height);
+      width_scale = static_cast<double>(out_width) / static_cast<double>(in_width);
+    }
+    const int64_t elem_cnt = out_height * out_width;
+
+    if (in_height == out_height && in_width == out_width) {
+      Memcpy<DeviceType::kCUDA>(
+          ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(),
+          dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type()));
+    } else {
+      const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
+      const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
+
+      RUN_CUDA_KERNEL((UpsampleBicubic2dBackward<T>), ctx->stream(), elem_cnt, elem_cnt,
+                      dy_tensor->dptr<T>(), nbatch, channels, in_height, in_width, out_height,
+                      out_width, scale_height, scale_width, align_corners,
+                      dx_tensor->mut_dptr<T>());
+    }
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(dtype)                                      \
+  REGISTER_USER_KERNEL("upsample_bicubic_2d")                                             \
+      .SetCreateFn<UpsampleBicubic2dGPUKernel<dtype>>()                                   \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                    \
+                       && (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));   \
+  REGISTER_USER_KERNEL("upsample_bicubic_2d_grad")                                        \
+      .SetCreateFn<UpsampleBicubic2dGradGPUKernel<dtype>>()                               \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                    \
+                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
+
+REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(float)
+REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(double)
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp b/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp
index 1a4eb29..b1756a9 100644
--- a/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp
+++ b/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp
@@ -1,190 +1,190 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleBilinear2DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_height, const int64_t in_width, - const T scale_h, const T scale_w, - const bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - out_helper.OffsetToNdIndex(index, n, c, h, w); - BilinearParam params; - GetBilinearParam(align_corners, h, w, in_height, in_width, scale_h, scale_w, ¶ms); - const int64_t top_offset = in_helper.NdIndexToOffset(n, c, params.top_h_index, 0); - const int64_t bottom_offset = in_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); - const T top_left = in_dptr[top_offset + params.left_w_index]; - const T top_right = in_dptr[top_offset + params.right_w_index]; - const T bottom_left = in_dptr[bottom_offset + params.left_w_index]; - const T bottom_right = in_dptr[bottom_offset + params.right_w_index]; - out_dptr[index] = - (1 - params.h_lerp) * ((1 - params.w_lerp) * top_left + params.w_lerp * top_right) - + params.h_lerp * ((1 - params.w_lerp) * bottom_left + params.w_lerp * bottom_right); - } -} - -template -__global__ void UpsampleBilinearBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t dx_height, const int64_t dx_width, - const T scale_h, const T scale_w, const bool align_corners, - T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - dy_helper.OffsetToNdIndex(index, n, c, h, w); - BilinearParam params; - GetBilinearParam(align_corners, h, w, dx_height, dx_width, scale_h, scale_w, ¶ms); - const int64_t top_offset = dx_helper.NdIndexToOffset(n, c, params.top_h_index, 0); - const int64_t bottom_offset = dx_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); - const T dy = dy_dptr[index]; - const T dbottom = params.h_lerp * dy; - T* dx_dptr_bottom_offset = dx_dptr + bottom_offset; - cuda::atomic::Add(dx_dptr_bottom_offset + params.left_w_index, - static_cast((1 - params.w_lerp) * dbottom)); - cuda::atomic::Add(dx_dptr_bottom_offset + params.right_w_index, - static_cast(params.w_lerp * dbottom)); - const T dtop = dy - dbottom; - T* dx_dptr_top_offset = dx_dptr + top_offset; - cuda::atomic::Add(dx_dptr_top_offset + params.left_w_index, - static_cast((1 - params.w_lerp) * dtop)); - cuda::atomic::Add(dx_dptr_top_offset + params.right_w_index, - static_cast(params.w_lerp * dtop)); - } -} - -} // namespace - -template -class UpsampleBilinear2DGPUKernel final : public user_op::OpKernel { - public: - UpsampleBilinear2DGPUKernel() = default; - ~UpsampleBilinear2DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const bool align_corners = ctx->Attr("align_corners"); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - 
NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3)); - - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t in_width = x_tensor->shape_view().At(3); - const int64_t out_height = y_tensor->shape_view().At(2); - const int64_t out_width = y_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - RUN_CUDA_KERNEL((UpsampleBilinear2DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, in_height, in_width, scale_height, - scale_width, align_corners, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleBilinear2DGradGPUKernel final : public user_op::OpKernel { - public: - UpsampleBilinear2DGradGPUKernel() = default; - ~UpsampleBilinear2DGradGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), - dy_tensor->shape_view().At(3)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3)); - - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t in_width = dx_tensor->shape_view().At(3); - const int64_t out_height = dy_tensor->shape_view().At(2); - const int64_t out_width = dy_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - RUN_CUDA_KERNEL((UpsampleBilinearBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, in_height, in_width, scale_height, - 
scale_width, align_corners, dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_bilinear_2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_bilinear_2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(float) -REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleBilinear2DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_height, const int64_t in_width, + const T scale_h, const T scale_w, + const bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + out_helper.OffsetToNdIndex(index, n, c, h, w); + BilinearParam params; + GetBilinearParam(align_corners, h, w, in_height, in_width, scale_h, scale_w, ¶ms); + const int64_t top_offset = in_helper.NdIndexToOffset(n, c, params.top_h_index, 0); + const int64_t bottom_offset = in_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); + const T top_left = in_dptr[top_offset + params.left_w_index]; + const T top_right = in_dptr[top_offset + params.right_w_index]; + const T bottom_left = in_dptr[bottom_offset + params.left_w_index]; + const T bottom_right = in_dptr[bottom_offset + params.right_w_index]; + out_dptr[index] = + (1 - params.h_lerp) * ((1 - params.w_lerp) * top_left + params.w_lerp * top_right) + + params.h_lerp * ((1 - params.w_lerp) * bottom_left + params.w_lerp * bottom_right); + } +} + +template +__global__ void UpsampleBilinearBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t dx_height, const int64_t dx_width, + const T scale_h, const T scale_w, const bool align_corners, + T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + dy_helper.OffsetToNdIndex(index, n, c, h, w); + BilinearParam params; + GetBilinearParam(align_corners, h, w, dx_height, dx_width, scale_h, scale_w, ¶ms); + const int64_t top_offset = dx_helper.NdIndexToOffset(n, c, params.top_h_index, 0); + const int64_t bottom_offset = dx_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); + const T dy = dy_dptr[index]; + const T dbottom = params.h_lerp * dy; + T* 
dx_dptr_bottom_offset = dx_dptr + bottom_offset; + cuda::atomic::Add(dx_dptr_bottom_offset + params.left_w_index, + static_cast((1 - params.w_lerp) * dbottom)); + cuda::atomic::Add(dx_dptr_bottom_offset + params.right_w_index, + static_cast(params.w_lerp * dbottom)); + const T dtop = dy - dbottom; + T* dx_dptr_top_offset = dx_dptr + top_offset; + cuda::atomic::Add(dx_dptr_top_offset + params.left_w_index, + static_cast((1 - params.w_lerp) * dtop)); + cuda::atomic::Add(dx_dptr_top_offset + params.right_w_index, + static_cast(params.w_lerp * dtop)); + } +} + +} // namespace + +template +class UpsampleBilinear2DGPUKernel final : public user_op::OpKernel { + public: + UpsampleBilinear2DGPUKernel() = default; + ~UpsampleBilinear2DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const bool align_corners = ctx->Attr("align_corners"); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); + + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + RUN_CUDA_KERNEL((UpsampleBilinear2DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, in_height, in_width, scale_height, + scale_width, align_corners, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleBilinear2DGradGPUKernel final : public user_op::OpKernel { + public: + UpsampleBilinear2DGradGPUKernel() = default; + ~UpsampleBilinear2DGradGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + 
NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3)); + + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + RUN_CUDA_KERNEL((UpsampleBilinearBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, in_height, in_width, scale_height, + scale_width, align_corners, dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_bilinear_2d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_bilinear_2d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(float) +REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp b/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp index a949f1e..9850fa2 100644 --- a/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp @@ -1,163 +1,163 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleLinear1DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int in_height, const double scale_factor, - bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - out_helper.OffsetToNdIndex(index, n, c, h); - const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const double h1lambda = h1r - h1; - const double h0lambda = static_cast(1.) - h1lambda; - out_dptr[index] = h0lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1)] - + h1lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1 + h1p)]; - } -} - -template -__global__ void UpsampleLinear1DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int in_height, const double scale_factor, - bool align_corners, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - dy_helper.OffsetToNdIndex(index, n, c, h); - const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const double h1lambda = h1r - h1; - const double h0lambda = static_cast(1.) - h1lambda; - - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1), h0lambda * dy_dptr[index]); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1 + h1p), - h1lambda * dy_dptr[index]); - } -} - -} // namespace - -template -class UpsampleLinear1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleLinear1DGPUKernel() = default; - ~UpsampleLinear1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(2); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - RUN_CUDA_KERNEL((UpsampleLinear1DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, in_height, scale_height, - align_corners, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const 
override { return false; } -}; - -template -class UpsampleLinearGrad1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleLinearGrad1DGPUKernel() = default; - ~UpsampleLinearGrad1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - - NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), - dy_tensor->shape_view().At(1), - dy_tensor->shape_view().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), - dx_tensor->shape_view().At(1), - dx_tensor->shape_view().At(2)); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(2); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - RUN_CUDA_KERNEL((UpsampleLinear1DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, in_height, scale_height, - align_corners, dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_linear_1d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_linear_1d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(float) -REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleLinear1DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int in_height, const double scale_factor, + bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + out_helper.OffsetToNdIndex(index, n, c, h); + const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const double h1lambda = h1r - h1; + const double h0lambda = static_cast(1.) - h1lambda; + out_dptr[index] = h0lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1)] + + h1lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1 + h1p)]; + } +} + +template +__global__ void UpsampleLinear1DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int in_height, const double scale_factor, + bool align_corners, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + dy_helper.OffsetToNdIndex(index, n, c, h); + const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const double h1lambda = h1r - h1; + const double h0lambda = static_cast(1.) - h1lambda; + + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1), h0lambda * dy_dptr[index]); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1 + h1p), + h1lambda * dy_dptr[index]); + } +} + +} // namespace + +template +class UpsampleLinear1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleLinear1DGPUKernel() = default; + ~UpsampleLinear1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const bool align_corners = ctx->Attr("align_corners"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + RUN_CUDA_KERNEL((UpsampleLinear1DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, in_height, scale_height, + align_corners, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const 
override { return false; } +}; + +template +class UpsampleLinearGrad1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleLinearGrad1DGPUKernel() = default; + ~UpsampleLinearGrad1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + RUN_CUDA_KERNEL((UpsampleLinear1DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, in_height, scale_height, + align_corners, dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_linear_1d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_linear_1d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(float) +REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp b/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp index c007355..4de05b7 100644 --- a/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp @@ -1,412 +1,412 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleNearest1DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_height, const double scale_factor, - T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - out_helper.OffsetToNdIndex(index, n, c, h); - const int64_t in_h = GetNearestInputIndex(h, scale_factor, in_height); - out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h)]; - } -} - -template -__global__ void UpsampleNearest1DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t in_height, const double scale_factor, - T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - dy_helper.OffsetToNdIndex(index, n, c, h); - const int64_t dx_h = GetNearestInputIndex(h, scale_factor, in_height); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h), dy_dptr[index]); - } -} - -template -__global__ void UpsampleNearest2DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_height, const int64_t in_width, - const double scale_h, const double scale_w, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - out_helper.OffsetToNdIndex(index, n, c, h, w); - const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); - const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); - out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h, in_w)]; - } -} - -template -__global__ void UpsampleNearest2DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t dx_height, const int64_t dx_width, - const double scale_h, const double scale_w, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - dy_helper.OffsetToNdIndex(index, n, c, h, w); - const int64_t dx_h = GetNearestInputIndex(h, scale_h, dx_height); - const int64_t dx_w = GetNearestInputIndex(w, scale_w, dx_width); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h, dx_w), dy_dptr[index]); - } -} - -template -__global__ void UpsampleNearest3DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const float scale_d, - const float scale_h, const float scale_w, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - out_helper.OffsetToNdIndex(index, n, c, d, h, w); - const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); - const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); - const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); - out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_d, in_h, in_w)]; - } -} - -template -__global__ void UpsampleNearest3DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const float scale_d, - const float scale_h, const 
float scale_w, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - dy_helper.OffsetToNdIndex(index, n, c, d, h, w); - const int64_t dx_h = GetNearestInputIndex(h, scale_h, in_height); - const int64_t dx_w = GetNearestInputIndex(w, scale_w, in_width); - const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, in_d, dx_h, dx_w), dy_dptr[index]); - } -} - -} // namespace - -template -class UpsampleNearest1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest1DGPUKernel() = default; - ~UpsampleNearest1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(2); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); - RUN_CUDA_KERNEL((UpsampleNearest1DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - 1.f / height_scale, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleNearestGrad1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearestGrad1DGPUKernel() = default; - ~UpsampleNearestGrad1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(2); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), - dy_tensor->shape_view().At(1), - dy_tensor->shape_view().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), - dx_tensor->shape_view().At(1), - dx_tensor->shape_view().At(2)); - RUN_CUDA_KERNEL((UpsampleNearest1DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, 
dx_tensor->shape_view().At(2), - 1.f / height_scale, dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_nearest_1d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_nearest_1d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(float) -REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(double) - -template -class UpsampleNearest2DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest2DGPUKernel() = default; - ~UpsampleNearest2DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t in_width = x_tensor->shape_view().At(3); - const int64_t out_height = y_tensor->shape_view().At(2); - const int64_t out_width = y_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3)); - RUN_CUDA_KERNEL((UpsampleNearest2DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, - y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleNearest2DGradGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest2DGradGPUKernel() = default; - ~UpsampleNearest2DGradGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t in_width = dx_tensor->shape_view().At(3); - const int64_t 
out_height = dy_tensor->shape_view().At(2); - const int64_t out_width = dy_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), - dy_tensor->shape_view().At(2), dy_tensor->shape_view().At(3)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), - dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3)); - RUN_CUDA_KERNEL((UpsampleNearest2DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, - dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_nearest_2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_nearest_2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(float) -REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(double) - -template -class UpsampleNearest3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest3DGPUKernel() = default; - ~UpsampleNearest3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = x_tensor->shape_view().At(2); - const int64_t in_height = x_tensor->shape_view().At(3); - const int64_t in_width = x_tensor->shape_view().At(4); - const int64_t out_depth = y_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(3); - const int64_t out_width = y_tensor->shape_view().At(4); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); - RUN_CUDA_KERNEL((UpsampleNearest3DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - 
x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), 1.f / depth_scale, - 1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleNearestGrad3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearestGrad3DGPUKernel() = default; - ~UpsampleNearestGrad3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = dx_tensor->shape_view().At(2); - const int64_t in_height = dx_tensor->shape_view().At(3); - const int64_t in_width = dx_tensor->shape_view().At(4); - const int64_t out_depth = dy_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(3); - const int64_t out_width = dy_tensor->shape_view().At(4); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), - dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); - RUN_CUDA_KERNEL((UpsampleNearest3DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), 1.f / depth_scale, - 1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_nearest_3d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_nearest_3d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(float) -REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleNearest1DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_height, const double scale_factor, + T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + out_helper.OffsetToNdIndex(index, n, c, h); + const int64_t in_h = GetNearestInputIndex(h, scale_factor, in_height); + out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h)]; + } +} + +template +__global__ void UpsampleNearest1DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t in_height, const double scale_factor, + T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + dy_helper.OffsetToNdIndex(index, n, c, h); + const int64_t dx_h = GetNearestInputIndex(h, scale_factor, in_height); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h), dy_dptr[index]); + } +} + +template +__global__ void UpsampleNearest2DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_height, const int64_t in_width, + const double scale_h, const double scale_w, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + out_helper.OffsetToNdIndex(index, n, c, h, w); + const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); + const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); + out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h, in_w)]; + } +} + +template +__global__ void UpsampleNearest2DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t dx_height, const int64_t dx_width, + const double scale_h, const double scale_w, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + dy_helper.OffsetToNdIndex(index, n, c, h, w); + const int64_t dx_h = GetNearestInputIndex(h, scale_h, dx_height); + const int64_t dx_w = GetNearestInputIndex(w, scale_w, dx_width); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h, dx_w), dy_dptr[index]); + } +} + +template +__global__ void UpsampleNearest3DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_depth, const int64_t in_height, + const int64_t in_width, const float scale_d, + const float scale_h, const float scale_w, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + out_helper.OffsetToNdIndex(index, n, c, d, h, w); + const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); + const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); + const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); + out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_d, in_h, in_w)]; + } +} + +template +__global__ void UpsampleNearest3DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t in_depth, 
const int64_t in_height, + const int64_t in_width, const float scale_d, + const float scale_h, const float scale_w, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + dy_helper.OffsetToNdIndex(index, n, c, d, h, w); + const int64_t dx_h = GetNearestInputIndex(h, scale_h, in_height); + const int64_t dx_w = GetNearestInputIndex(w, scale_w, in_width); + const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, in_d, dx_h, dx_w), dy_dptr[index]); + } +} + +} // namespace + +template +class UpsampleNearest1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest1DGPUKernel() = default; + ~UpsampleNearest1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); + RUN_CUDA_KERNEL((UpsampleNearest1DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + 1.f / height_scale, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleNearestGrad1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearestGrad1DGPUKernel() = default; + ~UpsampleNearestGrad1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); + 
RUN_CUDA_KERNEL((UpsampleNearest1DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + 1.f / height_scale, dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_nearest_1d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_nearest_1d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(float) +REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(double) + +template +class UpsampleNearest2DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest2DGPUKernel() = default; + ~UpsampleNearest2DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); + RUN_CUDA_KERNEL((UpsampleNearest2DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, + y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleNearest2DGradGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest2DGradGPUKernel() = default; + ~UpsampleNearest2DGradGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const 
int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2), dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3)); + RUN_CUDA_KERNEL((UpsampleNearest2DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, + dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_nearest_2d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_nearest_2d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(float) +REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(double) + +template +class UpsampleNearest3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest3DGPUKernel() = default; + ~UpsampleNearest3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t in_depth = x_tensor->shape_view().At(2); + const int64_t in_height = x_tensor->shape_view().At(3); + const int64_t in_width = x_tensor->shape_view().At(4); + const int64_t out_depth = y_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(3); + const int64_t out_width = y_tensor->shape_view().At(4); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); + RUN_CUDA_KERNEL((UpsampleNearest3DForward), 
ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), 1.f / depth_scale, + 1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleNearestGrad3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearestGrad3DGPUKernel() = default; + ~UpsampleNearestGrad3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t in_depth = dx_tensor->shape_view().At(2); + const int64_t in_height = dx_tensor->shape_view().At(3); + const int64_t in_width = dx_tensor->shape_view().At(4); + const int64_t out_depth = dy_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(3); + const int64_t out_width = dy_tensor->shape_view().At(4); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); + RUN_CUDA_KERNEL((UpsampleNearest3DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), 1.f / depth_scale, + 1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_nearest_3d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_nearest_3d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(float) +REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp b/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp index 489312f..030c651 100644 --- a/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp @@ -1,237 +1,237 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleTrilinear3DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const T rdepth, const T rheight, - const T rwidth, const bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - out_helper.OffsetToNdIndex(index, n, c, d, h, w); - - const T t1r = GetAreaPixel(rdepth, d, align_corners); - const int64_t t1 = t1r; - const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; - const T t1lambda = t1r - t1; - const T t0lambda = static_cast(1.) - t1lambda; - - const T h1r = GetAreaPixel(rheight, h, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const T h1lambda = h1r - h1; - const T h0lambda = static_cast(1.) - h1lambda; - - const T w1r = GetAreaPixel(rwidth, w, align_corners); - const int64_t w1 = w1r; - const int64_t w1p = (w1 < in_width - 1) ? 1 : 0; - const T w1lambda = w1r - w1; - const T w0lambda = static_cast(1.) - w1lambda; - - const T* pos1 = &in_dptr[in_helper.NdIndexToOffset(n, c, t1, h1, w1)]; - - out_dptr[index] = - t0lambda - * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) - + h1lambda - * (w0lambda * pos1[h1p * in_width] + w1lambda * pos1[h1p * in_width + w1p])) - + t1lambda - * (h0lambda - * (w0lambda * pos1[t1p * in_height * in_width] - + w1lambda * pos1[t1p * in_height * in_width + w1p]) - + h1lambda - * (w0lambda * pos1[t1p * in_height * in_width + h1p * in_width] - + w1lambda * pos1[t1p * in_height * in_width + h1p * in_width + w1p])); - } -} - -template -__global__ void UpsampleTrilinear3DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const T rdepth, const T rheight, - const T rwidth, const bool align_corners, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - dy_helper.OffsetToNdIndex(index, n, c, d, h, w); - - const T t1r = GetAreaPixel(rdepth, d, align_corners); - const int64_t t1 = t1r; - const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; - const T t1lambda = t1r - t1; - const T t0lambda = static_cast(1.) - t1lambda; - - const T h1r = GetAreaPixel(rheight, h, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const T h1lambda = h1r - h1; - const T h0lambda = static_cast(1.) - h1lambda; - - const T w1r = GetAreaPixel(rwidth, w, align_corners); - const int64_t w1 = w1r; - const int64_t w1p = (w1 < in_width - 1) ? 
1 : 0; - const T w1lambda = w1r - w1; - const T w0lambda = static_cast(1.) - w1lambda; - - T* pos1 = &dx_dptr[dx_helper.NdIndexToOffset(n, c, t1, h1, w1)]; - const T* pos2 = &dy_dptr[index]; - - cuda::atomic::Add(pos1 + 0, t0lambda * h0lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + w1p, t0lambda * h0lambda * w1lambda * pos2[0]); - cuda::atomic::Add(pos1 + h1p * in_width, t0lambda * h1lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + h1p * in_width + w1p, t0lambda * h1lambda * w1lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width, t1lambda * h0lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width + w1p, - t1lambda * h0lambda * w1lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width, - t1lambda * h1lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width + w1p, - t1lambda * h1lambda * w1lambda * pos2[0]); - } -} - -} // namespace - -template -class UpsampleTrilinear3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleTrilinear3DGPUKernel() = default; - ~UpsampleTrilinear3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); - - const int64_t in_depth = x_tensor->shape_view().At(2); - const int64_t in_height = x_tensor->shape_view().At(3); - const int64_t in_width = x_tensor->shape_view().At(4); - - const int64_t out_depth = y_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(3); - const int64_t out_width = y_tensor->shape_view().At(4); - - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - - const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleTrilinear3DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), scale_depth, - scale_height, scale_width, align_corners, y_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleTrilinearGrad3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleTrilinearGrad3DGPUKernel() = default; - 
~UpsampleTrilinearGrad3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), - dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); - - const int64_t in_depth = dx_tensor->shape_view().At(2); - const int64_t in_height = dx_tensor->shape_view().At(3); - const int64_t in_width = dx_tensor->shape_view().At(4); - - const int64_t out_depth = dy_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(3); - const int64_t out_width = dy_tensor->shape_view().At(4); - - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - - const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleTrilinear3DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), scale_depth, - scale_height, scale_width, align_corners, dx_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_trilinear_3d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_trilinear_3d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(float) -REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleTrilinear3DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_depth, const int64_t in_height, + const int64_t in_width, const T rdepth, const T rheight, + const T rwidth, const bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + out_helper.OffsetToNdIndex(index, n, c, d, h, w); + + const T t1r = GetAreaPixel(rdepth, d, align_corners); + const int64_t t1 = t1r; + const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; + const T t1lambda = t1r - t1; + const T t0lambda = static_cast(1.) - t1lambda; + + const T h1r = GetAreaPixel(rheight, h, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const T h1lambda = h1r - h1; + const T h0lambda = static_cast(1.) - h1lambda; + + const T w1r = GetAreaPixel(rwidth, w, align_corners); + const int64_t w1 = w1r; + const int64_t w1p = (w1 < in_width - 1) ? 1 : 0; + const T w1lambda = w1r - w1; + const T w0lambda = static_cast(1.) - w1lambda; + + const T* pos1 = &in_dptr[in_helper.NdIndexToOffset(n, c, t1, h1, w1)]; + + out_dptr[index] = + t0lambda + * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + + h1lambda + * (w0lambda * pos1[h1p * in_width] + w1lambda * pos1[h1p * in_width + w1p])) + + t1lambda + * (h0lambda + * (w0lambda * pos1[t1p * in_height * in_width] + + w1lambda * pos1[t1p * in_height * in_width + w1p]) + + h1lambda + * (w0lambda * pos1[t1p * in_height * in_width + h1p * in_width] + + w1lambda * pos1[t1p * in_height * in_width + h1p * in_width + w1p])); + } +} + +template +__global__ void UpsampleTrilinear3DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t in_depth, const int64_t in_height, + const int64_t in_width, const T rdepth, const T rheight, + const T rwidth, const bool align_corners, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + dy_helper.OffsetToNdIndex(index, n, c, d, h, w); + + const T t1r = GetAreaPixel(rdepth, d, align_corners); + const int64_t t1 = t1r; + const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; + const T t1lambda = t1r - t1; + const T t0lambda = static_cast(1.) - t1lambda; + + const T h1r = GetAreaPixel(rheight, h, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const T h1lambda = h1r - h1; + const T h0lambda = static_cast(1.) - h1lambda; + + const T w1r = GetAreaPixel(rwidth, w, align_corners); + const int64_t w1 = w1r; + const int64_t w1p = (w1 < in_width - 1) ? 1 : 0; + const T w1lambda = w1r - w1; + const T w0lambda = static_cast(1.) 
- w1lambda; + + T* pos1 = &dx_dptr[dx_helper.NdIndexToOffset(n, c, t1, h1, w1)]; + const T* pos2 = &dy_dptr[index]; + + cuda::atomic::Add(pos1 + 0, t0lambda * h0lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + w1p, t0lambda * h0lambda * w1lambda * pos2[0]); + cuda::atomic::Add(pos1 + h1p * in_width, t0lambda * h1lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + h1p * in_width + w1p, t0lambda * h1lambda * w1lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width, t1lambda * h0lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width + w1p, + t1lambda * h0lambda * w1lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width, + t1lambda * h1lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width + w1p, + t1lambda * h1lambda * w1lambda * pos2[0]); + } +} + +} // namespace + +template +class UpsampleTrilinear3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleTrilinear3DGPUKernel() = default; + ~UpsampleTrilinear3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const bool align_corners = ctx->Attr("align_corners"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); + + const int64_t in_depth = x_tensor->shape_view().At(2); + const int64_t in_height = x_tensor->shape_view().At(3); + const int64_t in_width = x_tensor->shape_view().At(4); + + const int64_t out_depth = y_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(3); + const int64_t out_width = y_tensor->shape_view().At(4); + + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + + const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + + RUN_CUDA_KERNEL((UpsampleTrilinear3DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), scale_depth, + scale_height, scale_width, align_corners, y_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleTrilinearGrad3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleTrilinearGrad3DGPUKernel() = default; + ~UpsampleTrilinearGrad3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void 
Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); + + const int64_t in_depth = dx_tensor->shape_view().At(2); + const int64_t in_height = dx_tensor->shape_view().At(3); + const int64_t in_width = dx_tensor->shape_view().At(4); + + const int64_t out_depth = dy_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(3); + const int64_t out_width = dy_tensor->shape_view().At(4); + + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + + const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + + RUN_CUDA_KERNEL((UpsampleTrilinear3DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), scale_depth, + scale_height, scale_width, align_corners, dx_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_trilinear_3d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_trilinear_3d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(float) +REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/variance_kernel_util.hip.cpp b/oneflow/user/kernels/variance_kernel_util.hip.cpp index 44eb164..47245f3 100644 --- a/oneflow/user/kernels/variance_kernel_util.hip.cpp +++ b/oneflow/user/kernels/variance_kernel_util.hip.cpp @@ -1,192 +1,192 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/user/kernels/variance_kernel_util.h" -#include "oneflow/core/hip/layer_norm.hip.h" - -namespace oneflow { -namespace user_op { - -namespace { -template -__inline__ __device__ T Nan(); - -template<> -__inline__ __device__ float Nan() { - return __int_as_float(0x7fffffffU); -} - -template<> -__inline__ __device__ double Nan() { - return __longlong_as_double(0xfff8000000000000ULL); -} -} // namespace - -template -__global__ void ComputeVarUsingWelfordWrapper(const T* in_ptr, T* out_ptr, const VarParam var_param, - bool is_nan) { - if (is_nan) { - CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { out_ptr[i] = Nan(); } - } else { - CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { - const size_t input_offset = LinearIndex2Offset( - i, var_param.dim_size_in_caxis, var_param.stride_in_caxis, var_param.caxis_size); - ComputeVarUsingWelford(&in_ptr[input_offset], &out_ptr[i], var_param); - } - } -} - -namespace { -template -inline __device__ void WelfordReduce(const T* in_ptr, T* mean, T* m2, T* count, - const size_t total_elem_cnt, const size_t start, - const size_t step) { - T old_mean = 0.0; - for (size_t i = start; i < total_elem_cnt; i += step) { - ++(*count); - old_mean = *mean; - *mean += (in_ptr[i] - *mean) / *count; - *m2 += (in_ptr[i] - *mean) * (in_ptr[i] - old_mean); - } -} - -template -inline __device__ void WelfordCombine(const T* b_mean, const T* b_m2, const T* b_count, T* mean, - T* m2, T* count, const size_t total_elem_cnt, - const size_t start, const size_t step) { - for (size_t i = start; i < total_elem_cnt; i += step) { - cuda::layer_norm::WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count); - } -} -__device__ int32_t done_block_count = 0; -} // namespace - -template -__global__ void ComputeVarScalarOut(const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, - const VarParam var_param) { - if (var_param.elem_cnt == 1 && var_param.unbiased == true) { - if (blockIdx.x == 0 && threadIdx.x == 0) { *out_ptr = Nan(); } - return; - } - const size_t elems_per_block = var_param.elem_cnt / gridDim.x; - const size_t elems_per_thread = elems_per_block / blockDim.x; - // tail element number in block - size_t tail_elems = elems_per_block % blockDim.x; - - T thread_mean = 0.0; - T thread_m2 = 0.0; - T thread_count = 0.0; - // every thread deal it's elems - if (elems_per_thread > 0) { - const size_t block_offset = blockIdx.x * elems_per_block; - WelfordReduce(&in_ptr[block_offset], &thread_mean, &thread_m2, &thread_count, - elems_per_block - tail_elems, threadIdx.x, blockDim.x); - } - // thread 0 of last block handles tail element between blocks - if (blockIdx.x == gridDim.x - 1 && threadIdx.x == 0) { - tail_elems += var_param.elem_cnt % gridDim.x; - } - // thread 0 deal tail elems - if (tail_elems != 0 && threadIdx.x == 0) { - const size_t tail_offset = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread; - WelfordReduce(&in_ptr[tail_offset], &thread_mean, &thread_m2, &thread_count, tail_elems, - /*tail start=*/0, /*step=*/1); - } - - T block_mean = 0; - T block_m2 = 0; - T block_count = 0; - 
cuda::layer_norm::WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &block_mean, - &block_m2, &block_count); - - if (gridDim.x == 1) { - if (threadIdx.x == 0) { - *out_ptr = - cuda::layer_norm::Div(block_m2, (var_param.unbiased ? block_count - 1 : block_count)); - } - return; - } - - T* tmp_mean_ptr = tmp_buffer_ptr; - T* tmp_m2_ptr = &tmp_mean_ptr[gridDim.x]; - T* tmp_count_ptr = &tmp_m2_ptr[gridDim.x]; - if (threadIdx.x == 0) { - tmp_mean_ptr[blockIdx.x] = block_mean; - tmp_m2_ptr[blockIdx.x] = block_m2; - tmp_count_ptr[blockIdx.x] = block_count; - } - __shared__ bool is_last_block; - if (threadIdx.x == 0) { is_last_block = atomicAdd(&done_block_count, 1) == gridDim.x - 1; } - __syncthreads(); - if (is_last_block) { - T last_block_thread_mean = 0; - T last_block_thread_m2 = 0; - T last_block_thread_count = 0; - const size_t welforddatas_per_thread = gridDim.x / blockDim.x; - const size_t tail_welforddatas = gridDim.x % blockDim.x; - - if (welforddatas_per_thread > 0) { - WelfordCombine(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr, &last_block_thread_mean, - &last_block_thread_m2, &last_block_thread_count, gridDim.x - tail_welforddatas, - threadIdx.x, blockDim.x); - } - // thread 0 deal tail welford data - if (tail_welforddatas != 0 && threadIdx.x == 0) { - const size_t last_block_tail_offset = blockDim.x * welforddatas_per_thread; - WelfordCombine(&tmp_mean_ptr[last_block_tail_offset], &tmp_m2_ptr[last_block_tail_offset], - &tmp_count_ptr[last_block_tail_offset], &last_block_thread_mean, - &last_block_thread_m2, &last_block_thread_count, tail_welforddatas, - /*tail start=*/0, /*step=*/1); - } - T final_mean = 0; - T final_m2 = 0; - T final_count = 0; - cuda::layer_norm::WelfordBlockAllReduce(last_block_thread_mean, last_block_thread_m2, - last_block_thread_count, &final_mean, &final_m2, - &final_count); - if (threadIdx.x == 0) { - *out_ptr = - cuda::layer_norm::Div(final_m2, (var_param.unbiased ? final_count - 1 : final_count)); - done_block_count = 0; - } - } -} - -template -struct VarFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, - const VarParam var_param) { - int grid_dim = 0; - int block_dim = 0; - SetGridDimAndBlockDim(var_param.elem_cnt, &grid_dim, &block_dim); - if (var_param.parallel_num == 1) { - ComputeVarScalarOut - <<As()->cuda_stream()>>>( - in_ptr, out_ptr, tmp_buffer_ptr, var_param); - } else { - // if var_param.parallel_num is 0, do nothing, return 0-size tensor - if (var_param.parallel_num == 0) { return; } - RUN_CUDA_KERNEL(ComputeVarUsingWelfordWrapper, stream, var_param.parallel_num, in_ptr, - out_ptr, var_param, IsNanOut(var_param)); - } - } -}; - -template struct VarFunctor; -template struct VarFunctor; -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/user/kernels/variance_kernel_util.h" +#include "oneflow/core/hip/layer_norm.hip.h" + +namespace oneflow { +namespace user_op { + +namespace { +template +__inline__ __device__ T Nan(); + +template<> +__inline__ __device__ float Nan() { + return __int_as_float(0x7fffffffU); +} + +template<> +__inline__ __device__ double Nan() { + return __longlong_as_double(0xfff8000000000000ULL); +} +} // namespace + +template +__global__ void ComputeVarUsingWelfordWrapper(const T* in_ptr, T* out_ptr, const VarParam var_param, + bool is_nan) { + if (is_nan) { + CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { out_ptr[i] = Nan(); } + } else { + CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { + const size_t input_offset = LinearIndex2Offset( + i, var_param.dim_size_in_caxis, var_param.stride_in_caxis, var_param.caxis_size); + ComputeVarUsingWelford(&in_ptr[input_offset], &out_ptr[i], var_param); + } + } +} + +namespace { +template +inline __device__ void WelfordReduce(const T* in_ptr, T* mean, T* m2, T* count, + const size_t total_elem_cnt, const size_t start, + const size_t step) { + T old_mean = 0.0; + for (size_t i = start; i < total_elem_cnt; i += step) { + ++(*count); + old_mean = *mean; + *mean += (in_ptr[i] - *mean) / *count; + *m2 += (in_ptr[i] - *mean) * (in_ptr[i] - old_mean); + } +} + +template +inline __device__ void WelfordCombine(const T* b_mean, const T* b_m2, const T* b_count, T* mean, + T* m2, T* count, const size_t total_elem_cnt, + const size_t start, const size_t step) { + for (size_t i = start; i < total_elem_cnt; i += step) { + cuda::layer_norm::WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count); + } +} +__device__ int32_t done_block_count = 0; +} // namespace + +template +__global__ void ComputeVarScalarOut(const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, + const VarParam var_param) { + if (var_param.elem_cnt == 1 && var_param.unbiased == true) { + if (blockIdx.x == 0 && threadIdx.x == 0) { *out_ptr = Nan(); } + return; + } + const size_t elems_per_block = var_param.elem_cnt / gridDim.x; + const size_t elems_per_thread = elems_per_block / blockDim.x; + // tail element number in block + size_t tail_elems = elems_per_block % blockDim.x; + + T thread_mean = 0.0; + T thread_m2 = 0.0; + T thread_count = 0.0; + // every thread deal it's elems + if (elems_per_thread > 0) { + const size_t block_offset = blockIdx.x * elems_per_block; + WelfordReduce(&in_ptr[block_offset], &thread_mean, &thread_m2, &thread_count, + elems_per_block - tail_elems, threadIdx.x, blockDim.x); + } + // thread 0 of last block handles tail element between blocks + if (blockIdx.x == gridDim.x - 1 && threadIdx.x == 0) { + tail_elems += var_param.elem_cnt % gridDim.x; + } + // thread 0 deal tail elems + if (tail_elems != 0 && threadIdx.x == 0) { + const size_t tail_offset = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread; + WelfordReduce(&in_ptr[tail_offset], &thread_mean, &thread_m2, &thread_count, tail_elems, + /*tail start=*/0, /*step=*/1); + } + + T block_mean = 0; + T block_m2 = 0; + T block_count = 0; + cuda::layer_norm::WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &block_mean, + &block_m2, &block_count); + + if (gridDim.x == 1) { + if (threadIdx.x == 0) { + *out_ptr = + cuda::layer_norm::Div(block_m2, (var_param.unbiased ? 
block_count - 1 : block_count)); + } + return; + } + + T* tmp_mean_ptr = tmp_buffer_ptr; + T* tmp_m2_ptr = &tmp_mean_ptr[gridDim.x]; + T* tmp_count_ptr = &tmp_m2_ptr[gridDim.x]; + if (threadIdx.x == 0) { + tmp_mean_ptr[blockIdx.x] = block_mean; + tmp_m2_ptr[blockIdx.x] = block_m2; + tmp_count_ptr[blockIdx.x] = block_count; + } + __shared__ bool is_last_block; + if (threadIdx.x == 0) { is_last_block = atomicAdd(&done_block_count, 1) == gridDim.x - 1; } + __syncthreads(); + if (is_last_block) { + T last_block_thread_mean = 0; + T last_block_thread_m2 = 0; + T last_block_thread_count = 0; + const size_t welforddatas_per_thread = gridDim.x / blockDim.x; + const size_t tail_welforddatas = gridDim.x % blockDim.x; + + if (welforddatas_per_thread > 0) { + WelfordCombine(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr, &last_block_thread_mean, + &last_block_thread_m2, &last_block_thread_count, gridDim.x - tail_welforddatas, + threadIdx.x, blockDim.x); + } + // thread 0 deal tail welford data + if (tail_welforddatas != 0 && threadIdx.x == 0) { + const size_t last_block_tail_offset = blockDim.x * welforddatas_per_thread; + WelfordCombine(&tmp_mean_ptr[last_block_tail_offset], &tmp_m2_ptr[last_block_tail_offset], + &tmp_count_ptr[last_block_tail_offset], &last_block_thread_mean, + &last_block_thread_m2, &last_block_thread_count, tail_welforddatas, + /*tail start=*/0, /*step=*/1); + } + T final_mean = 0; + T final_m2 = 0; + T final_count = 0; + cuda::layer_norm::WelfordBlockAllReduce(last_block_thread_mean, last_block_thread_m2, + last_block_thread_count, &final_mean, &final_m2, + &final_count); + if (threadIdx.x == 0) { + *out_ptr = + cuda::layer_norm::Div(final_m2, (var_param.unbiased ? final_count - 1 : final_count)); + done_block_count = 0; + } + } +} + +template +struct VarFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, + const VarParam var_param) { + int grid_dim = 0; + int block_dim = 0; + SetGridDimAndBlockDim(var_param.elem_cnt, &grid_dim, &block_dim); + if (var_param.parallel_num == 1) { + ComputeVarScalarOut + <<As()->cuda_stream()>>>( + in_ptr, out_ptr, tmp_buffer_ptr, var_param); + } else { + // if var_param.parallel_num is 0, do nothing, return 0-size tensor + if (var_param.parallel_num == 0) { return; } + RUN_CUDA_KERNEL(ComputeVarUsingWelfordWrapper, stream, var_param.parallel_num, in_ptr, + out_ptr, var_param, IsNanOut(var_param)); + } + } +}; + +template struct VarFunctor; +template struct VarFunctor; +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/where_kernel_util.hip.cpp b/oneflow/user/kernels/where_kernel_util.hip.cpp index 276b1c1..b3a619a 100644 --- a/oneflow/user/kernels/where_kernel_util.hip.cpp +++ b/oneflow/user/kernels/where_kernel_util.hip.cpp @@ -1,90 +1,90 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/where_kernel_util.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct WhereFunctor { - OF_DEVICE_FUNC T operator()(CondT cond, T lhs, T rhs) const { - return static_cast(cond) ? lhs : rhs; - } -}; - -template -struct WhereScalarXFunctor { - OF_DEVICE_FUNC explicit WhereScalarXFunctor(T scalar) : x_scalar(scalar) {} - OF_DEVICE_FUNC T operator()(CondT cond, T rhs) const { - return static_cast(cond) ? x_scalar : rhs; - } - const T x_scalar; -}; - -template -struct WhereScalarYFunctor { - OF_DEVICE_FUNC explicit WhereScalarYFunctor(T scalar) : y_scalar(scalar) {} - OF_DEVICE_FUNC T operator()(CondT cond, T lhs) const { - return static_cast(cond) ? lhs : y_scalar; - } - const T y_scalar; -}; - -template -struct WhereScalarXYFunctor { - OF_DEVICE_FUNC explicit WhereScalarXYFunctor(T x_scalar, T y_scalar) - : x_scalar(x_scalar), y_scalar(y_scalar) {} - OF_DEVICE_FUNC T operator()(CondT cond) const { - return static_cast(cond) ? x_scalar : y_scalar; - } - const T x_scalar; - const T y_scalar; -}; - -} // namespace - -template -struct WhereKernelUtil { - static void Where(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, const T* lhs, - const T* rhs, T* out) { - cuda::elementwise::Ternary(WhereFunctor(), elem_cnt, out, cond, lhs, rhs, - stream->As()->cuda_stream()); - } - static void WhereXScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, - const T x_scalar, const T* rhs, T* out) { - cuda::elementwise::Binary(WhereScalarXFunctor(x_scalar), elem_cnt, out, cond, rhs, - stream->As()->cuda_stream()); - } - static void WhereYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, - const T* lhs, const T y_scalar, T* out) { - cuda::elementwise::Binary(WhereScalarYFunctor(y_scalar), elem_cnt, out, cond, lhs, - stream->As()->cuda_stream()); - } - static void WhereXYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, - const T x_scalar, const T y_scalar, T* out) { - cuda::elementwise::Unary(WhereScalarXYFunctor(x_scalar, y_scalar), elem_cnt, out, - cond, stream->As()->cuda_stream()); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_WHERE_FUNCTOR, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/where_kernel_util.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct WhereFunctor { + OF_DEVICE_FUNC T operator()(CondT cond, T lhs, T rhs) const { + return static_cast(cond) ? lhs : rhs; + } +}; + +template +struct WhereScalarXFunctor { + OF_DEVICE_FUNC explicit WhereScalarXFunctor(T scalar) : x_scalar(scalar) {} + OF_DEVICE_FUNC T operator()(CondT cond, T rhs) const { + return static_cast(cond) ? 
x_scalar : rhs; + } + const T x_scalar; +}; + +template +struct WhereScalarYFunctor { + OF_DEVICE_FUNC explicit WhereScalarYFunctor(T scalar) : y_scalar(scalar) {} + OF_DEVICE_FUNC T operator()(CondT cond, T lhs) const { + return static_cast(cond) ? lhs : y_scalar; + } + const T y_scalar; +}; + +template +struct WhereScalarXYFunctor { + OF_DEVICE_FUNC explicit WhereScalarXYFunctor(T x_scalar, T y_scalar) + : x_scalar(x_scalar), y_scalar(y_scalar) {} + OF_DEVICE_FUNC T operator()(CondT cond) const { + return static_cast(cond) ? x_scalar : y_scalar; + } + const T x_scalar; + const T y_scalar; +}; + +} // namespace + +template +struct WhereKernelUtil { + static void Where(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, const T* lhs, + const T* rhs, T* out) { + cuda::elementwise::Ternary(WhereFunctor(), elem_cnt, out, cond, lhs, rhs, + stream->As()->cuda_stream()); + } + static void WhereXScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, + const T x_scalar, const T* rhs, T* out) { + cuda::elementwise::Binary(WhereScalarXFunctor(x_scalar), elem_cnt, out, cond, rhs, + stream->As()->cuda_stream()); + } + static void WhereYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, + const T* lhs, const T y_scalar, T* out) { + cuda::elementwise::Binary(WhereScalarYFunctor(y_scalar), elem_cnt, out, cond, lhs, + stream->As()->cuda_stream()); + } + static void WhereXYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, + const T x_scalar, const T y_scalar, T* out) { + cuda::elementwise::Unary(WhereScalarXYFunctor(x_scalar, y_scalar), elem_cnt, out, + cond, stream->As()->cuda_stream()); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_WHERE_FUNCTOR, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/python/oneflow/test/modules/fused_dot_feature_interaction.py b/python/oneflow/test/modules/fused_dot_feature_interaction.py index 6712f04..7b86daa 100644 --- a/python/oneflow/test/modules/fused_dot_feature_interaction.py +++ b/python/oneflow/test/modules/fused_dot_feature_interaction.py @@ -1,43 +1,43 @@ -import numpy as np -import oneflow as flow - -def fused_dot_feature_interaction(x, - y, - self_interaction=False, - output_padding=0, - output_concat=None, - dtype=flow.float32 - ): - # (bs, es) = x.shape - (bs, dims, es) = y.shape - - if self_interaction: - offset = 1 - else: - offset = 0 - li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)]) - lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)]) - T = flow.cat( - [ - flow.reshape(x, (bs, 1, es)), - y, - ], - dim=1, - ) - Z = flow.matmul(T, T, transpose_b=True) - # gather_nd not support half, so cast to float32 - Z = flow.cast(Z, flow.float32) - Zflat = Z[:, li, lj] - Zflat = flow.cast(Zflat, dtype) - if output_concat is not None: - R = flow.cat([output_concat, Zflat], dim=1) - else: - R = Zflat - if output_padding != 0: - padding_tensor = flow.tensor( - np.zeros((bs, output_padding)).astype(np.float32), - device="cuda", - requires_grad=False, - ) - R = flow.cat([R, padding_tensor], dim=1) - return R +import numpy as np +import oneflow as flow + +def fused_dot_feature_interaction(x, + y, + self_interaction=False, + output_padding=0, + output_concat=None, + dtype=flow.float32 + ): + # (bs, es) = x.shape + (bs, dims, es) = y.shape + + if self_interaction: + offset = 1 + else: + offset = 0 + li = 
flow.tensor([i for i in range(dims + 1) for j in range(i + offset)]) + lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)]) + T = flow.cat( + [ + flow.reshape(x, (bs, 1, es)), + y, + ], + dim=1, + ) + Z = flow.matmul(T, T, transpose_b=True) + # gather_nd not support half, so cast to float32 + Z = flow.cast(Z, flow.float32) + Zflat = Z[:, li, lj] + Zflat = flow.cast(Zflat, dtype) + if output_concat is not None: + R = flow.cat([output_concat, Zflat], dim=1) + else: + R = Zflat + if output_padding != 0: + padding_tensor = flow.tensor( + np.zeros((bs, output_padding)).astype(np.float32), + device="cuda", + requires_grad=False, + ) + R = flow.cat([R, padding_tensor], dim=1) + return R diff --git a/python/oneflow/test/modules/test_conv.py b/python/oneflow/test/modules/test_conv.py index 001d35e..201f9b7 100644 --- a/python/oneflow/test/modules/test_conv.py +++ b/python/oneflow/test/modules/test_conv.py @@ -1,346 +1,346 @@ -import unittest -from collections import OrderedDict - -import numpy as np -from oneflow.test_utils.test_util import GenArgList -from oneflow.test_utils.automated_test_util import * - -import oneflow as flow -import oneflow.nn as nn -import oneflow.unittest - -np_arr = np.array([[[1.28795946, -0.2921792, 0.20338029, 0.78604293, -1.89607573]]]) -input = flow.tensor( - np_arr, dtype=flow.float32, device=flow.device("cuda"), requires_grad=True -) -weight = np.array( - [ - [[0.10197904, 0.3372305, -0.25743008]], - [[0.27720425, -0.52435774, -0.38381988]], - [[0.56016803, -0.10063095, -0.10760903]], - ] -) -m = nn.Conv1d(1, 3, 3, stride=1, bias=False) -m.weight = flow.nn.Parameter(flow.Tensor(weight)) -m = m.to("cuda") -output = m(input) -np_out = np.array( - [ - [ - [-0.01954307, -0.16356121, 0.77392507], - [0.43217283, -0.48933625, 0.37196174], - [0.72899038, -0.2687211, 0.23886177], - ] - ] -) -if np.allclose(output.numpy(), np_out, 1e-06, 1e-06): - print("conv1d Passed") -output = output.sum() -output.backward() -np_grad = np.array( - [[[0.93935132, 0.65159315, -0.09726584, -1.03661716, -0.74885899]]] -) -if np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06): - print("conv1d_back Passed") - - - -test_conv2d_weight = np.array( - [ - [ - [ - [0.8586049675941467, -0.2279418259859085, 0.2013147622346878], - [0.35005471110343933, 0.5360521078109741, 1.5194443464279175], - [1.9040879011154175, -1.5734431743621826, -0.14007866382598877], - ] - ], - [ - [ - [0.29670074582099915, 1.3111951351165771, 0.5035904049873352], - [-1.1894450187683105, -0.5502137541770935, -1.591875672340393], - [-1.1081947088241577, 0.07872020453214645, -0.9185634255409241], - ] - ], - [ - [ - [-0.7457143664360046, -1.2080862522125244, 1.8140212297439575], - [-1.5227429866790771, -2.515244960784912, -1.3549325466156006], - [-0.9574840068817139, -0.7248556613922119, 1.1119636297225952], - ] - ], - ] -) -test_conv2d_data = np.array( - [ - [ - [ - [ - 1.1630785465240479, - 0.4838046133518219, - 0.299563467502594, - 0.15302546322345734, - -1.168814778327942, - ], - [ - 1.5580710172653198, - -0.5459445714950562, - -2.3556296825408936, - 0.5414402484893799, - 2.678506374359131, - ], - [ - 1.2546343803405762, - -0.5487740635871887, - -0.6810643672943115, - -0.13531559705734253, - 0.37723132967948914, - ], - [ - 0.41016456484794617, - 0.5712682008743286, - -2.757962703704834, - 1.0762799978256226, - -0.6141325235366821, - ], - [ - 1.830764889717102, - -1.1468064785003662, - 0.053837940096855164, - -2.5074806213378906, - -0.5916498899459839, - ], - ] - ] - ] -) -test_conv2d_data_grad = np.array( 
- [ - [ - [ - [ - 0.4095913469791412, - 0.2847584038972855, - 2.803684800863266, - 2.3940934538841248, - 2.5189263969659805, - ], - [ - -1.9525419473648071, - -4.606781497597694, - -3.51521897315979, - -1.562677025794983, - 1.0915625244379044, - ], - [ - -2.1141327619552612, - -6.987950943410397, - -5.84306687861681, - -3.7289341166615486, - 1.1448840647935867, - ], - [ - -2.5237241089344025, - -7.272709347307682, - -8.646751679480076, - -6.123027570545673, - -1.3740423321723938, - ], - [ - -0.1615908145904541, - -2.381169445812702, - -2.32784790545702, - -2.1662570908665657, - 0.0533215403556824, - ], - ] - ] - ] -) -test_conv2d_weight_grad = np.array( - [ - [ - [ - [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], - [-3.095237225294113, -4.835702538490295, -1.8706469237804413], - [-1.0139376372098923, -6.076017692685127, -5.780256435275078], - ] - ], - [ - [ - [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], - [-3.095237225294113, -4.835702538490295, -1.8706469237804413], - [-1.0139376372098923, -6.076017692685127, -5.780256435275078], - ] - ], - [ - [ - [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], - [-3.095237225294113, -4.835702538490295, -1.8706469237804413], - [-1.0139376372098923, -6.076017692685127, -5.780256435275078], - ] - ], - ] -) -test_conv2d_output = np.array( - [ - [ - [ - [0.9699610471725464, -0.20758534967899323, 2.3857712745666504], - [0.3666309118270874, 4.690882682800293, -8.203354835510254], - [2.6072847843170166, -1.9033538103103638, 2.331153154373169], - ], - [ - [2.519343852996826, 2.3757898807525635, -1.6613528728485107], - [0.5777544379234314, -3.5739502906799316, 5.349126815795898], - [0.729295015335083, 1.5791023969650269, 3.7627718448638916], - ], - [ - [-0.27685487270355225, 6.446267127990723, -2.762883424758911], - [-8.25644588470459, 9.616064071655273, 8.005367279052734], - [-0.6944921016693115, 3.866114854812622, 4.788446426391602], - ], - ] - ] -) -test_conv2d_with_bias_weight = np.array( - [ - [ - [ - [1.8271433115005493, -1.0446699857711792, 1.0062190294265747], - [0.5174201130867004, -0.806931734085083, 1.3769007921218872], - [0.205885112285614, 0.9943519234657288, -0.23580588400363922], - ] - ], - [ - [ - [0.29881811141967773, -1.9982075691223145, 0.3511354625225067], - [-0.7644741535186768, 1.2594351768493652, -0.9629734754562378], - [0.5080506205558777, 0.7561734318733215, 1.6839302778244019], - ] - ], - [ - [ - [1.2573646306991577, 0.13123232126235962, 1.6403018236160278], - [-1.2138012647628784, 2.399970531463623, -0.38509097695350647], - [-0.9878040552139282, 0.9585888385772705, -1.4976465702056885], - ] - ], - ] -) -test_conv2d_with_bias_bias = np.array( - [0.6605162620544434, -0.18903568387031555, -0.27302607893943787] -) -test_conv2d_with_bias_data = np.array( - [ - [ - [ - [ - -0.47827261686325073, - -1.1739492416381836, - -0.7921845316886902, - 0.9321041703224182, - -3.1557741165161133, - ], - [ - 2.1935296058654785, - -0.5385921001434326, - -0.8611332774162292, - -1.881519079208374, - -0.7205708026885986, - ], - [ - -0.35601571202278137, - -0.15963983535766602, - 1.797447681427002, - 0.19594945013523102, - -1.7376397848129272, - ], - [ - 0.047347065061330795, - 0.14580930769443512, - 0.32604914903640747, - 0.4578782916069031, - -0.8942581415176392, - ], - [ - 0.49383941292762756, - -0.9043426513671875, - -1.2140793800354004, - 2.1564064025878906, - 1.0938222408294678, - ], - ] - ] - ] -) -test_conv2d_with_bias_output = np.array( - [ - [ - [ - [-0.05607491731643677, -0.185230553150177, 
-3.8808679580688477], - [6.861937046051025, -2.3341472148895264, -0.5597308874130249], - [1.8299254179000854, -2.770848274230957, 2.1958212852478027], - ], - [ - [2.9348952770233154, 4.117504119873047, -6.278541088104248], - [0.2638452351093292, 3.998856782913208, 2.612290620803833], - [-1.9891828298568726, -1.6476304531097412, 3.39066219329834], - ], - [ - [-8.44466781616211, 0.5747121572494507, -8.501373291015625], - [-0.036642804741859436, -0.23458999395370483, -2.370849370956421], - [2.8372013568878174, -2.987276077270508, 1.8382092714309692], - ], - ] - ] -) - -to_device = flow.device("cuda") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) -x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) -conv.bias = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) -conv.to(to_device) -of_out = conv(x) -if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): - print("conv2d_bias Passed") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) -x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) -conv.to(to_device) -of_out = conv(x) -of_out.sum().backward() -if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_data_grad Passed") - -if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_weight_grad Passed") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) -x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) -conv.bias = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) -conv.to(to_device) -of_out = conv(x) -if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): - print("conv2d_bias Passed") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) -x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) -conv.to(to_device) -of_out = conv(x) -of_out.sum().backward() -if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_data_grad Passed") - -if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_weight_grad Passed") - - - - - - +import unittest +from collections import OrderedDict + +import numpy as np +from oneflow.test_utils.test_util import GenArgList +from oneflow.test_utils.automated_test_util import * + +import oneflow as flow +import oneflow.nn as nn +import oneflow.unittest + +np_arr = np.array([[[1.28795946, -0.2921792, 0.20338029, 0.78604293, -1.89607573]]]) +input = flow.tensor( + np_arr, dtype=flow.float32, device=flow.device("cuda"), requires_grad=True +) +weight = np.array( + [ + [[0.10197904, 0.3372305, -0.25743008]], + [[0.27720425, -0.52435774, -0.38381988]], + [[0.56016803, -0.10063095, -0.10760903]], + ] +) +m = nn.Conv1d(1, 3, 3, stride=1, bias=False) +m.weight = flow.nn.Parameter(flow.Tensor(weight)) +m = m.to("cuda") +output = m(input) +np_out = np.array( + [ + [ + [-0.01954307, -0.16356121, 0.77392507], + [0.43217283, -0.48933625, 0.37196174], + [0.72899038, -0.2687211, 0.23886177], + ] + ] +) 
+if np.allclose(output.numpy(), np_out, 1e-06, 1e-06): + print("conv1d Passed") +output = output.sum() +output.backward() +np_grad = np.array( + [[[0.93935132, 0.65159315, -0.09726584, -1.03661716, -0.74885899]]] +) +if np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06): + print("conv1d_back Passed") + + + +test_conv2d_weight = np.array( + [ + [ + [ + [0.8586049675941467, -0.2279418259859085, 0.2013147622346878], + [0.35005471110343933, 0.5360521078109741, 1.5194443464279175], + [1.9040879011154175, -1.5734431743621826, -0.14007866382598877], + ] + ], + [ + [ + [0.29670074582099915, 1.3111951351165771, 0.5035904049873352], + [-1.1894450187683105, -0.5502137541770935, -1.591875672340393], + [-1.1081947088241577, 0.07872020453214645, -0.9185634255409241], + ] + ], + [ + [ + [-0.7457143664360046, -1.2080862522125244, 1.8140212297439575], + [-1.5227429866790771, -2.515244960784912, -1.3549325466156006], + [-0.9574840068817139, -0.7248556613922119, 1.1119636297225952], + ] + ], + ] +) +test_conv2d_data = np.array( + [ + [ + [ + [ + 1.1630785465240479, + 0.4838046133518219, + 0.299563467502594, + 0.15302546322345734, + -1.168814778327942, + ], + [ + 1.5580710172653198, + -0.5459445714950562, + -2.3556296825408936, + 0.5414402484893799, + 2.678506374359131, + ], + [ + 1.2546343803405762, + -0.5487740635871887, + -0.6810643672943115, + -0.13531559705734253, + 0.37723132967948914, + ], + [ + 0.41016456484794617, + 0.5712682008743286, + -2.757962703704834, + 1.0762799978256226, + -0.6141325235366821, + ], + [ + 1.830764889717102, + -1.1468064785003662, + 0.053837940096855164, + -2.5074806213378906, + -0.5916498899459839, + ], + ] + ] + ] +) +test_conv2d_data_grad = np.array( + [ + [ + [ + [ + 0.4095913469791412, + 0.2847584038972855, + 2.803684800863266, + 2.3940934538841248, + 2.5189263969659805, + ], + [ + -1.9525419473648071, + -4.606781497597694, + -3.51521897315979, + -1.562677025794983, + 1.0915625244379044, + ], + [ + -2.1141327619552612, + -6.987950943410397, + -5.84306687861681, + -3.7289341166615486, + 1.1448840647935867, + ], + [ + -2.5237241089344025, + -7.272709347307682, + -8.646751679480076, + -6.123027570545673, + -1.3740423321723938, + ], + [ + -0.1615908145904541, + -2.381169445812702, + -2.32784790545702, + -2.1662570908665657, + 0.0533215403556824, + ], + ] + ] + ] +) +test_conv2d_weight_grad = np.array( + [ + [ + [ + [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], + [-3.095237225294113, -4.835702538490295, -1.8706469237804413], + [-1.0139376372098923, -6.076017692685127, -5.780256435275078], + ] + ], + [ + [ + [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], + [-3.095237225294113, -4.835702538490295, -1.8706469237804413], + [-1.0139376372098923, -6.076017692685127, -5.780256435275078], + ] + ], + [ + [ + [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], + [-3.095237225294113, -4.835702538490295, -1.8706469237804413], + [-1.0139376372098923, -6.076017692685127, -5.780256435275078], + ] + ], + ] +) +test_conv2d_output = np.array( + [ + [ + [ + [0.9699610471725464, -0.20758534967899323, 2.3857712745666504], + [0.3666309118270874, 4.690882682800293, -8.203354835510254], + [2.6072847843170166, -1.9033538103103638, 2.331153154373169], + ], + [ + [2.519343852996826, 2.3757898807525635, -1.6613528728485107], + [0.5777544379234314, -3.5739502906799316, 5.349126815795898], + [0.729295015335083, 1.5791023969650269, 3.7627718448638916], + ], + [ + [-0.27685487270355225, 6.446267127990723, -2.762883424758911], + [-8.25644588470459, 
9.616064071655273, 8.005367279052734], + [-0.6944921016693115, 3.866114854812622, 4.788446426391602], + ], + ] + ] +) +test_conv2d_with_bias_weight = np.array( + [ + [ + [ + [1.8271433115005493, -1.0446699857711792, 1.0062190294265747], + [0.5174201130867004, -0.806931734085083, 1.3769007921218872], + [0.205885112285614, 0.9943519234657288, -0.23580588400363922], + ] + ], + [ + [ + [0.29881811141967773, -1.9982075691223145, 0.3511354625225067], + [-0.7644741535186768, 1.2594351768493652, -0.9629734754562378], + [0.5080506205558777, 0.7561734318733215, 1.6839302778244019], + ] + ], + [ + [ + [1.2573646306991577, 0.13123232126235962, 1.6403018236160278], + [-1.2138012647628784, 2.399970531463623, -0.38509097695350647], + [-0.9878040552139282, 0.9585888385772705, -1.4976465702056885], + ] + ], + ] +) +test_conv2d_with_bias_bias = np.array( + [0.6605162620544434, -0.18903568387031555, -0.27302607893943787] +) +test_conv2d_with_bias_data = np.array( + [ + [ + [ + [ + -0.47827261686325073, + -1.1739492416381836, + -0.7921845316886902, + 0.9321041703224182, + -3.1557741165161133, + ], + [ + 2.1935296058654785, + -0.5385921001434326, + -0.8611332774162292, + -1.881519079208374, + -0.7205708026885986, + ], + [ + -0.35601571202278137, + -0.15963983535766602, + 1.797447681427002, + 0.19594945013523102, + -1.7376397848129272, + ], + [ + 0.047347065061330795, + 0.14580930769443512, + 0.32604914903640747, + 0.4578782916069031, + -0.8942581415176392, + ], + [ + 0.49383941292762756, + -0.9043426513671875, + -1.2140793800354004, + 2.1564064025878906, + 1.0938222408294678, + ], + ] + ] + ] +) +test_conv2d_with_bias_output = np.array( + [ + [ + [ + [-0.05607491731643677, -0.185230553150177, -3.8808679580688477], + [6.861937046051025, -2.3341472148895264, -0.5597308874130249], + [1.8299254179000854, -2.770848274230957, 2.1958212852478027], + ], + [ + [2.9348952770233154, 4.117504119873047, -6.278541088104248], + [0.2638452351093292, 3.998856782913208, 2.612290620803833], + [-1.9891828298568726, -1.6476304531097412, 3.39066219329834], + ], + [ + [-8.44466781616211, 0.5747121572494507, -8.501373291015625], + [-0.036642804741859436, -0.23458999395370483, -2.370849370956421], + [2.8372013568878174, -2.987276077270508, 1.8382092714309692], + ], + ] + ] +) + +to_device = flow.device("cuda") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) +x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) +conv.bias = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) +conv.to(to_device) +of_out = conv(x) +if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): + print("conv2d_bias Passed") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) +x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) +conv.to(to_device) +of_out = conv(x) +of_out.sum().backward() +if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_data_grad Passed") + +if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_weight_grad Passed") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) +x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) +conv.bias 
= flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) +conv.to(to_device) +of_out = conv(x) +if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): + print("conv2d_bias Passed") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) +x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) +conv.to(to_device) +of_out = conv(x) +of_out.sum().backward() +if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_data_grad Passed") + +if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_weight_grad Passed") + + + + + + diff --git a/python/oneflow/test/modules/test_softmax_cross_entropy b/python/oneflow/test/modules/test_softmax_cross_entropy index 017d01a..a30b1cc 100644 --- a/python/oneflow/test/modules/test_softmax_cross_entropy +++ b/python/oneflow/test/modules/test_softmax_cross_entropy @@ -1,174 +1,174 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -import oneflow as flow -import oneflow.unittest -import numpy as np -import torch - -class TestSoftmaxCrossEntropyError(flow.unittest.TestCase): - def test_softmax_cross_entropy_prediction_numaxes_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - prediction = flow.randn(10) - label = flow.randn(1, 10) - flow._C.softmax_cross_entropy(prediction, label) - test_case.assertTrue( - "The dimension of prediction must be greater than or equal to 2, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_prediction_shape_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - prediction = flow.randn(1, 10) - label = flow.randn(1, 11) - flow._C.softmax_cross_entropy(prediction, label) - test_case.assertTrue( - "must match the size of prediction" in str(context.exception) - ) - - def test_softmax_cross_entropy_dtype_err(test_case): - with test_case.assertRaises(TypeError) as context: - prediction = flow.randn(1, 10, dtype=flow.float32) - label = flow.randn(1, 10, dtype=flow.float64) - flow._C.softmax_cross_entropy(prediction, label) - test_case.assertTrue( - "label and prediction are expected to have the same dtype, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_prob_numaxes_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 5) - label = flow.randn(10, 10, 5) - prob = flow.randn(10) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue( - "The dimension of prob must be greater than or equal to 2, but found " - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_dy_numaxes_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 10, 5) - label = flow.randn(10, 10, 5) - prob = flow.randn(10, 10, 5) - 
flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue( - "The dimension of dy is expected to be less than that of prob by 1, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_dy_i_shape_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 8) - label = flow.randn(10, 10, 5) - prob = flow.randn(10, 10, 5) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue("must match the size of label" in str(context.exception)) - - def test_softmax_cross_entropy_grad_prob_shape_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 10) - label = flow.randn(10, 10, 5) - prob = flow.randn(10, 10, 6) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue("must match the size of prob" in str(context.exception)) - - def test_softmax_cross_entropy_grad_label_dtype_err(test_case): - with test_case.assertRaises(TypeError) as context: - dy = flow.randn(10, 10, dtype=flow.float64) - label = flow.randn(10, 10, 5, dtype=flow.float32) - prob = flow.randn(10, 10, 5, dtype=flow.float64) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue( - "label and prob are expected to have the same dtype, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_dy_dtype_err(test_case): - with test_case.assertRaises(TypeError) as context: - dy = flow.randn(10, 10, dtype=flow.float32) - label = flow.randn(10, 10, 5, dtype=flow.float64) - prob = flow.randn(10, 10, 5, dtype=flow.float64) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - print(str(context.exception)) - test_case.assertTrue( - "dy and prob are expected to have the same dtype, but found" - in str(context.exception) - ) - - -if __name__ == "__main__": - - np_prediction = np.random.random((1, 10)).astype(np.float32) - np_label = np.random.random((1, 10)).astype(np.float32) - - of_prediction = flow.tensor( - np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) - of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) - of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") - of_output.sum() - - torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) - torch_label = torch.tensor(np_label, dtype=torch.float32) - torch_output = torch.nn.functional.cross_entropy( - torch_prediction, torch_label, reduction="none") - torch_output.sum() - - if np.allclose( - of_output.numpy(), torch_output.detach().numpy(), rtol=1e-03, atol=1e-04 - ): - print("test_softmax_cross_entropy Passed") - else: - print("test_softmax_cross_entropy Failed") - - np_prediction = np.random.random((1, 10, 2)).astype(np.float32) - np_label = np.random.random((1, 10, 2)).astype(np.float32) - - of_prediction = flow.tensor( - np_prediction, device=flow.device("cpu"), dtype=flow.float32, requires_grad=True) - of_label = flow.tensor(np_label, device=flow.device("cpu"), dtype=flow.float32) - of_output = flow._C.softmax_cross_entropy(of_prediction, of_label) - of_output.sum().backward() - print("of cpu res:") - print(of_prediction.grad.numpy()) - - of_prediction = flow.tensor( - np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) - of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) - of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") - of_output.sum().backward() - print("of gpu res:") - 
print(of_prediction.grad.numpy()) - - torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) - torch_label = torch.tensor(np_label, dtype=torch.float32) - torch_output = torch.nn.functional.cross_entropy( - torch_prediction, torch_label, reduction="none") - torch_output.sum().backward() - - print("*************************") - print(torch_prediction.grad) - # if np.allclose( - # of_prediction.grad.numpy(), torch_prediction.grad, rtol=1e-03, atol=1e-04 - # ): - # print("test_softmax_cross_entropy_grad Passed") - # else: - # print("test_softmax_cross_entropy_grad Failed") - - - - - - +""" +Copyright 2020 The OneFlow Authors. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import unittest +import oneflow as flow +import oneflow.unittest +import numpy as np +import torch + +class TestSoftmaxCrossEntropyError(flow.unittest.TestCase): + def test_softmax_cross_entropy_prediction_numaxes_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + prediction = flow.randn(10) + label = flow.randn(1, 10) + flow._C.softmax_cross_entropy(prediction, label) + test_case.assertTrue( + "The dimension of prediction must be greater than or equal to 2, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_prediction_shape_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + prediction = flow.randn(1, 10) + label = flow.randn(1, 11) + flow._C.softmax_cross_entropy(prediction, label) + test_case.assertTrue( + "must match the size of prediction" in str(context.exception) + ) + + def test_softmax_cross_entropy_dtype_err(test_case): + with test_case.assertRaises(TypeError) as context: + prediction = flow.randn(1, 10, dtype=flow.float32) + label = flow.randn(1, 10, dtype=flow.float64) + flow._C.softmax_cross_entropy(prediction, label) + test_case.assertTrue( + "label and prediction are expected to have the same dtype, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_prob_numaxes_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 5) + label = flow.randn(10, 10, 5) + prob = flow.randn(10) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue( + "The dimension of prob must be greater than or equal to 2, but found " + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_dy_numaxes_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 10, 5) + label = flow.randn(10, 10, 5) + prob = flow.randn(10, 10, 5) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue( + "The dimension of dy is expected to be less than that of prob by 1, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_dy_i_shape_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 8) + label = flow.randn(10, 10, 5) + prob = flow.randn(10, 10, 5) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + 
test_case.assertTrue("must match the size of label" in str(context.exception)) + + def test_softmax_cross_entropy_grad_prob_shape_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 10) + label = flow.randn(10, 10, 5) + prob = flow.randn(10, 10, 6) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue("must match the size of prob" in str(context.exception)) + + def test_softmax_cross_entropy_grad_label_dtype_err(test_case): + with test_case.assertRaises(TypeError) as context: + dy = flow.randn(10, 10, dtype=flow.float64) + label = flow.randn(10, 10, 5, dtype=flow.float32) + prob = flow.randn(10, 10, 5, dtype=flow.float64) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue( + "label and prob are expected to have the same dtype, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_dy_dtype_err(test_case): + with test_case.assertRaises(TypeError) as context: + dy = flow.randn(10, 10, dtype=flow.float32) + label = flow.randn(10, 10, 5, dtype=flow.float64) + prob = flow.randn(10, 10, 5, dtype=flow.float64) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + print(str(context.exception)) + test_case.assertTrue( + "dy and prob are expected to have the same dtype, but found" + in str(context.exception) + ) + + +if __name__ == "__main__": + + np_prediction = np.random.random((1, 10)).astype(np.float32) + np_label = np.random.random((1, 10)).astype(np.float32) + + of_prediction = flow.tensor( + np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) + of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) + of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") + of_output.sum() + + torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) + torch_label = torch.tensor(np_label, dtype=torch.float32) + torch_output = torch.nn.functional.cross_entropy( + torch_prediction, torch_label, reduction="none") + torch_output.sum() + + if np.allclose( + of_output.numpy(), torch_output.detach().numpy(), rtol=1e-03, atol=1e-04 + ): + print("test_softmax_cross_entropy Passed") + else: + print("test_softmax_cross_entropy Failed") + + np_prediction = np.random.random((1, 10, 2)).astype(np.float32) + np_label = np.random.random((1, 10, 2)).astype(np.float32) + + of_prediction = flow.tensor( + np_prediction, device=flow.device("cpu"), dtype=flow.float32, requires_grad=True) + of_label = flow.tensor(np_label, device=flow.device("cpu"), dtype=flow.float32) + of_output = flow._C.softmax_cross_entropy(of_prediction, of_label) + of_output.sum().backward() + print("of cpu res:") + print(of_prediction.grad.numpy()) + + of_prediction = flow.tensor( + np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) + of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) + of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") + of_output.sum().backward() + print("of gpu res:") + print(of_prediction.grad.numpy()) + + torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) + torch_label = torch.tensor(np_label, dtype=torch.float32) + torch_output = torch.nn.functional.cross_entropy( + torch_prediction, torch_label, reduction="none") + torch_output.sum().backward() + + print("*************************") + print(torch_prediction.grad) + # if np.allclose( + # of_prediction.grad.numpy(), 
torch_prediction.grad, rtol=1e-03, atol=1e-04 + # ): + # print("test_softmax_cross_entropy_grad Passed") + # else: + # print("test_softmax_cross_entropy_grad Failed") + + + + + + diff --git a/python/oneflow/test/profiler/test_profile_lenet.py b/python/oneflow/test/profiler/test_profile_lenet.py index 07d3a3c..d573277 100644 --- a/python/oneflow/test/profiler/test_profile_lenet.py +++ b/python/oneflow/test/profiler/test_profile_lenet.py @@ -1,148 +1,148 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import os -import unittest -import oneflow.unittest -import oneflow as flow -import oneflow.nn as nn -import oneflow.nn.functional as F -import oneflow.profiler -from oneflow.profiler.events import CustomEvent, KernelEvent - - -class LeNet(nn.Module): - def __init__(self): - super(LeNet, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - out = F.relu(self.conv1(x)) - out = F.max_pool2d(out, 2) - out = F.relu(self.conv2(out)) - out = F.max_pool2d(out, 2) - out = out.view(out.size(0), -1) - out = F.relu(self.fc1(out)) - out = F.relu(self.fc2(out)) - out = self.fc3(out) - return out - - -def get_event(events, name: str, input_shapes: str = "-"): - for item in events: - if isinstance(item, CustomEvent): - if item.name == name: - return item - if isinstance(item, KernelEvent): - if item.name == name and item.input_shapes == input_shapes: - return item - return None - - -def _test_lenet( - test_case, - on_cuda: bool, - record_shapes: bool, - record_bandwidth_for_cuda: bool = False, -): - x = flow.randn(2, 3, 32, 32) - lenet = LeNet() - if on_cuda: - x = x.to("cuda") - lenet.to("cuda") - activities = [oneflow.profiler.ProfilerActivity.CPU] - if on_cuda: - activities.append(oneflow.profiler.ProfilerActivity.CUDA) - with oneflow.profiler.profile( - activities=activities, - record_shapes=record_shapes, - record_bandwidth_for_cuda=record_bandwidth_for_cuda, - ) as prof: - with oneflow.profiler.record_function("lenet_forward_total_time") as f: - for _ in range(2): - eager_res = lenet(x) - with oneflow.profiler.record_function("lenet_backward_total_time") as f: - eager_res.sum().backward() - events = prof.key_averages(group_by_input_shape=True) - print(events) - conv_event = get_event( - events, "conv2d", "[(2,3,32,32), (6,3,5,5)]" if record_shapes else "-" - ) - test_case.assertIsNotNone(conv_event) - - if on_cuda: - test_case.assertGreater(conv_event.cpu_time, 0.0) - test_case.assertGreater(conv_event.cpu_time_total, 0.0) - test_case.assertGreater(conv_event.cuda_time, 0.0) - test_case.assertGreater(conv_event.cuda_time_total, 0.0) - else: - test_case.assertGreater(conv_event.cpu_time, 0.0) - test_case.assertGreater(conv_event.cpu_time_total, 0.0) - - test_case.assertEqual(conv_event.count, 2 if record_shapes else 4) - if record_bandwidth_for_cuda and on_cuda: - 
test_case.assertNotEqual(conv_event.bandwidth, -1) - - relu_grad_event = get_event( - events, "relu_grad", "[(2,6,28,28), (2,6,28,28)]" if record_shapes else "-" - ) - test_case.assertIsNotNone(relu_grad_event) - if on_cuda: - test_case.assertGreater(relu_grad_event.cpu_time, 0.0) - test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) - test_case.assertGreater(relu_grad_event.cuda_time, 0.0) - test_case.assertGreater(relu_grad_event.cuda_time_total, 0.0) - else: - test_case.assertGreater(relu_grad_event.cpu_time, 0.0) - test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) - - test_case.assertEqual(relu_grad_event.count, 1 if record_shapes else 4) - if record_bandwidth_for_cuda and on_cuda: - test_case.assertNotEqual(relu_grad_event.bandwidth, -1) - - test_case.assertIsNotNone(get_event(events, "lenet_forward_total_time")) - test_case.assertIsNotNone(get_event(events, "lenet_backward_total_time")) - - -class TestProfileLenet(flow.unittest.TestCase): - def test_lenet_cpu(test_case): - _test_lenet(test_case, on_cuda=False, record_shapes=True) - _test_lenet(test_case, on_cuda=False, record_shapes=False) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_lenet_cuda(test_case): - _test_lenet( - test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=False - ) - _test_lenet( - test_case, - on_cuda=True, - record_shapes=False, - record_bandwidth_for_cuda=False, - ) - _test_lenet( - test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=True - ) - _test_lenet( - test_case, on_cuda=True, record_shapes=False, record_bandwidth_for_cuda=True - ) - - -if __name__ == "__main__": - unittest.main() +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import os +import unittest +import oneflow.unittest +import oneflow as flow +import oneflow.nn as nn +import oneflow.nn.functional as F +import oneflow.profiler +from oneflow.profiler.events import CustomEvent, KernelEvent + + +class LeNet(nn.Module): + def __init__(self): + super(LeNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + out = F.relu(self.conv1(x)) + out = F.max_pool2d(out, 2) + out = F.relu(self.conv2(out)) + out = F.max_pool2d(out, 2) + out = out.view(out.size(0), -1) + out = F.relu(self.fc1(out)) + out = F.relu(self.fc2(out)) + out = self.fc3(out) + return out + + +def get_event(events, name: str, input_shapes: str = "-"): + for item in events: + if isinstance(item, CustomEvent): + if item.name == name: + return item + if isinstance(item, KernelEvent): + if item.name == name and item.input_shapes == input_shapes: + return item + return None + + +def _test_lenet( + test_case, + on_cuda: bool, + record_shapes: bool, + record_bandwidth_for_cuda: bool = False, +): + x = flow.randn(2, 3, 32, 32) + lenet = LeNet() + if on_cuda: + x = x.to("cuda") + lenet.to("cuda") + activities = [oneflow.profiler.ProfilerActivity.CPU] + if on_cuda: + activities.append(oneflow.profiler.ProfilerActivity.CUDA) + with oneflow.profiler.profile( + activities=activities, + record_shapes=record_shapes, + record_bandwidth_for_cuda=record_bandwidth_for_cuda, + ) as prof: + with oneflow.profiler.record_function("lenet_forward_total_time") as f: + for _ in range(2): + eager_res = lenet(x) + with oneflow.profiler.record_function("lenet_backward_total_time") as f: + eager_res.sum().backward() + events = prof.key_averages(group_by_input_shape=True) + print(events) + conv_event = get_event( + events, "conv2d", "[(2,3,32,32), (6,3,5,5)]" if record_shapes else "-" + ) + test_case.assertIsNotNone(conv_event) + + if on_cuda: + test_case.assertGreater(conv_event.cpu_time, 0.0) + test_case.assertGreater(conv_event.cpu_time_total, 0.0) + test_case.assertGreater(conv_event.cuda_time, 0.0) + test_case.assertGreater(conv_event.cuda_time_total, 0.0) + else: + test_case.assertGreater(conv_event.cpu_time, 0.0) + test_case.assertGreater(conv_event.cpu_time_total, 0.0) + + test_case.assertEqual(conv_event.count, 2 if record_shapes else 4) + if record_bandwidth_for_cuda and on_cuda: + test_case.assertNotEqual(conv_event.bandwidth, -1) + + relu_grad_event = get_event( + events, "relu_grad", "[(2,6,28,28), (2,6,28,28)]" if record_shapes else "-" + ) + test_case.assertIsNotNone(relu_grad_event) + if on_cuda: + test_case.assertGreater(relu_grad_event.cpu_time, 0.0) + test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) + test_case.assertGreater(relu_grad_event.cuda_time, 0.0) + test_case.assertGreater(relu_grad_event.cuda_time_total, 0.0) + else: + test_case.assertGreater(relu_grad_event.cpu_time, 0.0) + test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) + + test_case.assertEqual(relu_grad_event.count, 1 if record_shapes else 4) + if record_bandwidth_for_cuda and on_cuda: + test_case.assertNotEqual(relu_grad_event.bandwidth, -1) + + test_case.assertIsNotNone(get_event(events, "lenet_forward_total_time")) + test_case.assertIsNotNone(get_event(events, "lenet_backward_total_time")) + + +class TestProfileLenet(flow.unittest.TestCase): + def test_lenet_cpu(test_case): + _test_lenet(test_case, on_cuda=False, record_shapes=True) + 
_test_lenet(test_case, on_cuda=False, record_shapes=False) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_lenet_cuda(test_case): + _test_lenet( + test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=False + ) + _test_lenet( + test_case, + on_cuda=True, + record_shapes=False, + record_bandwidth_for_cuda=False, + ) + _test_lenet( + test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=True + ) + _test_lenet( + test_case, on_cuda=True, record_shapes=False, record_bandwidth_for_cuda=True + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/version_script.lds b/version_script.lds index 665a2f9..4737392 100644 --- a/version_script.lds +++ b/version_script.lds @@ -1,7 +1,7 @@ -{ - global: - *; - local: - *llvm*; -}; - +{ + global: + *; + local: + *llvm*; +}; + -- GitLab